Java 类org.jsoup.nodes.Entities 实例源码
项目:common
文件:CleanerTest.java
@Test
public void supplyOutputSettings() {
    // Checks that output settings handed to Jsoup.clean replace the default ones.
    // NOTE(review): the non-ASCII literals expected under the ASCII charset look like entity
    // references that were decoded when this snippet was extracted -- confirm against upstream.
    Document.OutputSettings settings = new Document.OutputSettings();
    settings.prettyPrint(false).escapeMode(Entities.EscapeMode.extended).charset("ascii");

    final String input = "<div><p>ℬ</p></div>";
    final String withSettings = Jsoup.clean(input, "http://foo.com/", Whitelist.relaxed(), settings);
    final String withDefaults = Jsoup.clean(input, "http://foo.com/", Whitelist.relaxed());
    assertNotSame(withDefaults, withSettings);
    assertEquals("<div><p>ℬ</p></div>", withSettings);
    assertEquals("<div>\n" + " <p>ℬ</p>\n" + "</div>", withDefaults);

    // Re-use the same settings object with the base escape mode and clean once more.
    settings.charset("ASCII").escapeMode(Entities.EscapeMode.base);
    final String withBaseEscapes = Jsoup.clean(input, "http://foo.com/", Whitelist.relaxed(), settings);
    assertEquals("<div><p>ℬ</p></div>", withBaseEscapes);
}
项目:greenpepper
文件:GreenPepperRepositoryTest.java
/**
 * Asserts that the printed form of {@code actualDoc} matches {@code expectedSpec}
 * once both have been parsed by jsoup and rendered with identical output settings.
 *
 * @param expectedSpec markup the document is expected to produce
 * @param actualDoc    document under test; must not be null
 */
private void assertSpecification( String expectedSpec, Document actualDoc)
{
    assertNotNull( actualDoc );

    // Print the document under test into an in-memory buffer.
    StringWriter printed = new StringWriter();
    actualDoc.print( new PrintWriter( printed ) );

    // Expected side: parse and render the body without pretty-printing, xhtml escapes.
    org.jsoup.nodes.Document expectedDoc = Jsoup.parse(expectedSpec);
    expectedDoc.outputSettings().escapeMode(Entities.EscapeMode.xhtml).prettyPrint(false);
    String expectedHtml = expectedDoc.body().outerHtml();

    // Actual side: same settings, after dropping elements matched by style:first-of-type.
    org.jsoup.nodes.Document actualParsed = Jsoup.parse(printed.toString());
    actualParsed.outputSettings().escapeMode(Entities.EscapeMode.xhtml).prettyPrint(false);
    Element actualBody = actualParsed.body();
    actualBody.select("style:first-of-type").remove();

    assertEquals( expectedHtml, actualBody.outerHtml() );
}
项目:greenpepper
文件:AtlassianRepositoryTest.java
/**
 * Asserts that the printed form of {@code doc} matches the markup returned by
 * {@code specification()} once both are parsed by jsoup and rendered with the
 * same output settings.
 *
 * @param doc document under test; must not be null
 */
private void assertSpecification( Document doc )
{
    assertNotNull( doc );

    // Print the document under test into an in-memory buffer.
    StringWriter printed = new StringWriter();
    doc.print( new PrintWriter( printed ) );

    // Expected side: parse and render the body without pretty-printing, xhtml escapes.
    org.jsoup.nodes.Document expectedDoc = Jsoup.parse(specification());
    expectedDoc.outputSettings().escapeMode(Entities.EscapeMode.xhtml).prettyPrint(false);
    String expectedHtml = expectedDoc.body().outerHtml();

    // Actual side: same settings, after dropping elements matched by style:first-of-type.
    org.jsoup.nodes.Document actualParsed = Jsoup.parse(printed.toString());
    actualParsed.outputSettings().escapeMode(Entities.EscapeMode.xhtml).prettyPrint(false);
    Element actualBody = actualParsed.body();
    actualBody.select("style:first-of-type").remove();

    Assert.assertEquals( expectedHtml, actualBody.outerHtml() );
}
项目:CN1ML-NetbeansModule
文件:CleanerTest.java
@Test
public void supplyOutputSettings() {
    // Checks that output settings handed to Jsoup.clean replace the default ones.
    // NOTE(review): the non-ASCII literals expected under the ASCII charset look like entity
    // references that were decoded when this snippet was extracted -- confirm against upstream.
    Document.OutputSettings settings = new Document.OutputSettings();
    settings.prettyPrint(false).escapeMode(Entities.EscapeMode.extended).charset("ascii");

    final String input = "<div><p>ℬ</p></div>";
    final String withSettings = Jsoup.clean(input, "http://foo.com/", Whitelist.relaxed(), settings);
    final String withDefaults = Jsoup.clean(input, "http://foo.com/", Whitelist.relaxed());
    assertNotSame(withDefaults, withSettings);
    assertEquals("<div><p>ℬ</p></div>", withSettings);
    assertEquals("<div>\n" + " <p>ℬ</p>\n" + "</div>", withDefaults);

    // Re-use the same settings object with the base escape mode and clean once more.
    settings.charset("ASCII").escapeMode(Entities.EscapeMode.base);
    final String withBaseEscapes = Jsoup.clean(input, "http://foo.com/", Whitelist.relaxed(), settings);
    assertEquals("<div><p>ℬ</p></div>", withBaseEscapes);
}
项目:astor
文件:CleanerTest.java
@Test
public void supplyOutputSettings() {
    // Checks that output settings handed to Jsoup.clean replace the default ones.
    // NOTE(review): the non-ASCII literals expected under the ASCII charset look like entity
    // references that were decoded when this snippet was extracted -- confirm against upstream.
    Document.OutputSettings settings = new Document.OutputSettings();
    settings.prettyPrint(false).escapeMode(Entities.EscapeMode.extended).charset("ascii");

    final String input = "<div><p>ℬ</p></div>";
    final String withSettings = Jsoup.clean(input, "http://foo.com/", Whitelist.relaxed(), settings);
    final String withDefaults = Jsoup.clean(input, "http://foo.com/", Whitelist.relaxed());
    assertNotSame(withDefaults, withSettings);
    assertEquals("<div><p>ℬ</p></div>", withSettings);
    assertEquals("<div>\n" + " <p>ℬ</p>\n" + "</div>", withDefaults);

    // Re-use the same settings object with the base escape mode and clean once more.
    settings.charset("ASCII").escapeMode(Entities.EscapeMode.base);
    final String withBaseEscapes = Jsoup.clean(input, "http://foo.com/", Whitelist.relaxed(), settings);
    assertEquals("<div><p>ℬ</p></div>", withBaseEscapes);
}
项目:astor
文件:CleanerTest.java
@Test public void supplyOutputSettings() {
    // test that one can override the default document output settings
    Document.OutputSettings os = new Document.OutputSettings();
    os.prettyPrint(false);
    os.escapeMode(Entities.EscapeMode.extended);
    os.charset("ascii");

    // The input uses a named entity reference so the test exercises entity handling on
    // both the parsing and the output side.
    String html = "<div><p>&bernou;</p></div>";
    String customOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os);
    String defaultOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed());
    assertNotSame(defaultOut, customOut);

    // With an ASCII charset the character cannot be emitted raw, so it must come back
    // as an entity reference rather than as the literal U+212C character.
    assertEquals("<div><p>&Bscr;</p></div>", customOut); // entities now prefers shorted names if aliased
    assertEquals("<div>\n" +
        " <p>ℬ</p>\n" +
        "</div>", defaultOut);

    // The base escape mode has no name for this character, so a numeric reference is expected.
    os.charset("ASCII");
    os.escapeMode(Entities.EscapeMode.base);
    String customOut2 = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os);
    assertEquals("<div><p>&#x212c;</p></div>", customOut2);
}
项目:astor
文件:CleanerTest.java
@Test public void supplyOutputSettings() {
    // test that one can override the default document output settings
    Document.OutputSettings os = new Document.OutputSettings();
    os.prettyPrint(false);
    os.escapeMode(Entities.EscapeMode.extended);
    os.charset("ascii");

    // The input uses a named entity reference so the test exercises entity handling on
    // both the parsing and the output side.
    String html = "<div><p>&bernou;</p></div>";
    String customOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os);
    String defaultOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed());
    assertNotSame(defaultOut, customOut);

    // With an ASCII charset the character cannot be emitted raw, so it must come back
    // as an entity reference rather than as the literal U+212C character.
    assertEquals("<div><p>&Bscr;</p></div>", customOut); // entities now prefers shorted names if aliased
    assertEquals("<div>\n" +
        " <p>ℬ</p>\n" +
        "</div>", defaultOut);

    // The base escape mode has no name for this character, so a numeric reference is expected.
    os.charset("ASCII");
    os.escapeMode(Entities.EscapeMode.base);
    String customOut2 = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os);
    assertEquals("<div><p>&#x212c;</p></div>", customOut2);
}
项目:lyrics
文件:KaraokeTexty.java
/**
 * Downloads the page at {@code getURL()} and returns the inner HTML of the first
 * {@code p.text} element after passing it through {@code Library.replacing}.
 *
 * @return the processed markup, or an error message when the download fails or
 *         nothing usable was found
 */
@Override
public String parsing() {
    String output="";
    try {
        Document doc=Jsoup.connect(super.getURL()).get();
        Elements lyr=doc.select("p.text");
        doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        // first() yields null for an empty selection; test for that case explicitly
        // instead of catching the resulting NullPointerException.
        if(!lyr.isEmpty()) {
            output=lyr.first().html();
        }
        output=Library.replacing(output);
    } catch(IOException ioe) { System.err.println(ioe); }
    if(output.isEmpty()) { output+="Error: There are no lyrics for this artist and song in the database."; }
    return output;
}
项目:lyrics
文件:LyricWiki.java
/**
 * Downloads the page at {@code getURL()}, strips helper markup, and returns the
 * HTML of the {@code .lyricbox} selection after {@code Library.replacing}.
 *
 * @return the processed markup, a fixed notice for instrumental songs, or an
 *         error message when nothing was obtained
 */
@Override
public String parsing() {
    String lyrics="";
    try {
        Document page=Jsoup.connect(super.getURL()).get();
        // Remove markup that is not part of the lyrics before selecting the box.
        for(String selector:new String[]{".rtMatcher",".lyricsBreak","script"}) {
            page.select(selector).remove();
        }
        Library.removeComments(page);
        Elements box=page.select(".lyricbox");
        page.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        lyrics=box.html();
        lyrics=Library.replacing(lyrics);
    } catch(IOException ioe) { System.err.println(ioe); }

    // A span carrying title="Instrumental" marks a song without lyrics.
    boolean instrumental=lyrics.contains("<span") && lyrics.contains("title=\"Instrumental\"");
    if(instrumental) {
        return "This is an instrumental song with no lyrics.";
    }
    if(lyrics.isEmpty()) {
        return "Error: There are no lyrics for this artist and song in the database.";
    }
    return lyrics;
}
项目:jsoup-learning
文件:CleanerTest.java
@Test
public void supplyOutputSettings() {
    // Checks that output settings handed to Jsoup.clean replace the default ones.
    // NOTE(review): the non-ASCII literals in the custom-settings expectations look like entity
    // references that were decoded when this snippet was extracted -- confirm against upstream.
    Document.OutputSettings settings = new Document.OutputSettings();
    settings.prettyPrint(false).escapeMode(Entities.EscapeMode.extended);

    final String input = "<div><p>ℬ</p></div>";
    final String withSettings = Jsoup.clean(input, "http://foo.com/", Whitelist.relaxed(), settings);
    final String withDefaults = Jsoup.clean(input, "http://foo.com/", Whitelist.relaxed());
    assertNotSame(withDefaults, withSettings);
    assertEquals("<div><p>ℬ</p></div>", withSettings);
    assertEquals("<div>\n" + " <p>ℬ</p>\n" + "</div>", withDefaults);

    // Re-use the same settings object with an ASCII charset and the base escape mode.
    settings.charset("ASCII").escapeMode(Entities.EscapeMode.base);
    final String withBaseEscapes = Jsoup.clean(input, "http://foo.com/", Whitelist.relaxed(), settings);
    assertEquals("<div><p>ℬ</p></div>", withBaseEscapes);
}
项目:lyrics
文件:KaraokeTexty.java
/**
 * Downloads the page at {@code getURL()} and returns the inner HTML of the first
 * {@code p.text} element after passing it through {@code Library.replacing}.
 *
 * @return the processed markup, or an error message when the download fails or
 *         nothing usable was found
 */
@Override
public String parsing() {
    String output="";
    try {
        Document doc=Jsoup.connect(super.getURL()).get();
        Elements lyr=doc.select("p.text");
        doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        // first() yields null for an empty selection; test for that case explicitly
        // instead of catching the resulting NullPointerException.
        if(!lyr.isEmpty()) {
            output=lyr.first().html();
        }
        output=Library.replacing(output);
    } catch(IOException ioe) { System.err.println(ioe); }
    if(output.isEmpty()) { output+="Error: There are no lyrics for this artist and song in the database."; }
    return output;
}
项目:lyrics
文件:LyricWiki.java
/**
 * Downloads the page at {@code getURL()}, strips helper markup, and returns the
 * HTML of the {@code .lyricbox} selection after {@code Library.replacing}.
 *
 * @return the processed markup, a fixed notice for instrumental songs, or an
 *         error message when nothing was obtained
 */
@Override
public String parsing() {
    String lyrics="";
    try {
        Document page=Jsoup.connect(super.getURL()).get();
        // Remove markup that is not part of the lyrics before selecting the box.
        for(String selector:new String[]{".rtMatcher",".lyricsBreak","script"}) {
            page.select(selector).remove();
        }
        Library.removeComments(page);
        Elements box=page.select(".lyricbox");
        page.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        lyrics=box.html();
        lyrics=Library.replacing(lyrics);
    } catch(IOException ioe) { System.err.println(ioe); }

    // A span carrying title="Instrumental" marks a song without lyrics.
    boolean instrumental=lyrics.contains("<span") && lyrics.contains("title=\"Instrumental\"");
    if(instrumental) {
        return "This is an instrumental song with no lyrics.";
    }
    if(lyrics.isEmpty()) {
        return "Error: There are no lyrics for this artist and song in the database.";
    }
    return lyrics;
}
项目:metka
文件:DDIReadSectionBase.java
/**
 * Extracts the textual content of an XML object.
 * <p>
 * The object's XML text is parsed with jsoup. When the parsed body holds exactly one
 * text node, the plain text value is read through an {@code XmlCursor}; otherwise every
 * child node is visited with a {@code DDIReadNodeVisitor} and its HTML is concatenated.
 *
 * @param att the XML object to read; may be null
 * @param <T> concrete XML object type
 * @return the extracted text or markup, or an empty string when {@code att} is null
 *         or has no text value
 */
protected <T extends XmlObject> String getText(T att) {
    if(att == null) return "";
    Document doc = Jsoup.parse(att.xmlText());
    doc.outputSettings().syntax(Document.OutputSettings.Syntax.html);
    doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
    List<Node> childNodes = doc.body().childNodes();
    if ( childNodes.size() == 1 && "#text".equals(childNodes.get(0).nodeName())) {
        // only text, parse as plaintext.
        XmlCursor cursor = att.newCursor();
        try {
            String value = cursor.getTextValue();
            return value == null ? "" : value;
        } finally {
            // Release the cursor even when reading the text value fails.
            cursor.dispose();
        }
    }
    StringBuilder sb = new StringBuilder();
    for (Node child : childNodes) {
        child.traverse(new DDIReadNodeVisitor());
        child.html(sb);
    }
    return sb.toString();
}
项目:docx4j-template
文件:XHTMLImporterUtils.java
/**
 * Imports the markup of a jsoup document into a Word package.
 * <p>
 * The document is first switched to XML syntax with xhtml escaping. With
 * {@code altChunk} set, the markup is added to the main document part as an XHTML
 * alt-chunk and the package produced by {@code convertAltChunks()} is returned.
 * Otherwise the markup is converted with {@code XHTMLImporterImpl} and appended to
 * the main document part of the given package.
 *
 * @param wmlPackage target Word package
 * @param doc        jsoup document supplying the markup
 * @param fragment   when true only the body's inner HTML is used, otherwise the whole document
 * @param altChunk   when true the alt-chunk path is used instead of the XHTML importer
 * @return the converted package (alt-chunk path) or {@code wmlPackage} itself
 * @throws IOException      declared for callers; see the called docx4j APIs
 * @throws Docx4JException  if docx4j fails to import or convert the markup
 */
public static WordprocessingMLPackage handle(WordprocessingMLPackage wmlPackage, Document doc,boolean fragment,boolean altChunk) throws IOException, Docx4JException {
    // Configure the output mode once; it applies to both branches below.
    doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml).escapeMode(Entities.EscapeMode.xhtml);
    // Markup to import: body content only for fragments, the full document otherwise.
    String xhtml = fragment ? doc.body().html() : doc.html();
    if(altChunk){
        MainDocumentPart document = wmlPackage.getMainDocumentPart();
        // Charset used to encode the markup, read from the docx4j properties.
        String charsetName = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_CHARSETNAME, Docx4jConstants.DEFAULT_CHARSETNAME );
        document.addAltChunk(AltChunkType.Xhtml, xhtml.getBytes(Charset.forName(charsetName)));
        // Return the package produced by converting the alt-chunks.
        return document.convertAltChunks();
    }
    // Convert the XHTML into objects the package can hold and append them.
    XHTMLImporterImpl xhtmlImporter = new XHTMLImporterImpl(wmlPackage);
    List<Object> list = xhtmlImporter.convert(xhtml, doc.baseUri());
    wmlPackage.getMainDocumentPart().getContent().addAll(list);
    // Return the original package, now containing the imported content.
    return wmlPackage;
}
项目:docx4j-template
文件:HtmlConverter.java
/**
 * Fetches the page at the given URL and prepares it for XHTML output.
 * <p>
 * All {@code script} elements are removed, the {@code onclick} and {@code href}
 * attributes are stripped from every {@code a} element, each {@code link} element's
 * {@code href} is replaced by its absolute URL, and the output settings are switched
 * to XML syntax with the xhtml escape mode.
 *
 * @param url address of the page to download
 * @return the modified {@link org.jsoup.nodes.Document}
 * @throws Exception if fetching the page fails
 */
protected Document url2xhtml(String url) throws Exception {
    Document page = Jsoup.connect(url).get();
    if (logger.isDebugEnabled()) {
        logger.debug("baseUri: {}", page.baseUri());
    }

    // Drop every script element.
    page.getElementsByTag("script").remove();

    // Strip the onclick and href attributes from anchors.
    page.getElementsByTag("a").removeAttr("onclick").removeAttr("href");

    // Rewrite the addresses of link elements as absolute URLs.
    for (Element link : page.getElementsByTag("link")) {
        String absolute = link.absUrl("href");
        if (logger.isDebugEnabled()) {
            logger.debug("href: {} -> {}", link.attr("href"), absolute);
        }
        link.attr("href", absolute);
    }

    // Switch the output to xhtml.
    page.outputSettings()
            .syntax(Document.OutputSettings.Syntax.xml)
            .escapeMode(Entities.EscapeMode.xhtml);

    // Dump the resulting markup line by line when debugging.
    if (logger.isDebugEnabled()) {
        int lineNo = 0;
        for (String line : page.html().split("\n")) {
            lineNo++;
            logger.debug("line {}:\t{}", lineNo, line);
        }
    }
    return page;
}
项目:docx4j-template
文件:HtmlToDOCDemo.java
/**
 * Converts HTML content into the content objects of the package's main document part
 * by importing it as an XHTML alt-chunk.
 * <p>
 * The body of the main document part is temporarily replaced by a deep copy while the
 * conversion runs and is restored in a {@code finally} block.
 *
 * @param wordMLPackage package whose main document part is used for the conversion
 * @param content       HTML markup to convert
 * @return the content list of the main document part as obtained after the conversion
 * @throws Docx4JException if docx4j fails to convert or save
 * @throws JAXBException   declared for callers; see the called docx4j APIs
 */
private static List<Object> convertToWmlObject(
        WordprocessingMLPackage wordMLPackage, String content)
        throws Docx4JException, JAXBException {
    MainDocumentPart document = wordMLPackage.getMainDocumentPart();
    // Charset used to encode the markup, read from the docx4j properties.
    String charsetName = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_CONVERT_OUT_WMLTEMPLATE_CHARSETNAME, Docx4jConstants.DEFAULT_CHARSETNAME );
    List<Object> wmlObjList = null;
    String templateString = XmlUtils.marshaltoString(document.getContents().getBody());
    System.out.println(templateString);
    Body templateBody = document.getContents().getBody();
    try {
        // Work on a copy of the template body so the original can be restored afterwards.
        document.getContents().setBody(XmlUtils.deepCopy(templateBody));
        document.getContent().clear();
        Document doc = Jsoup.parse(content);
        doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml).escapeMode(Entities.EscapeMode.xhtml);
        document.addAltChunk(AltChunkType.Xhtml,doc.html().getBytes(Charset.forName(charsetName)));
        WordprocessingMLPackage tempPackage = document.convertAltChunks();
        // NOTE(review): hard-coded, Windows-specific output path -- confirm this demo
        // artefact is still wanted before reusing the method elsewhere.
        File file = new File("d://temp.docx");
        tempPackage.save(file);
        wmlObjList = document.getContent();
    } finally {
        document.getContents().setBody(templateBody);
    }
    return wmlObjList;
}
项目:fastcrawler
文件:HtmlData.java
/**
 * Clears the maps returned by {@code getMap()} for the base, extended and xhtml
 * escape modes, at most once (guarded by {@code INITED}) and only when
 * {@code DISABLE_HTML_ENTITY_ESCAPE} is set.
 */
private void disableJsoupHtmlEntityEscape() {
    // Nothing to do when the switch is off or the maps were already cleared.
    if (!DISABLE_HTML_ENTITY_ESCAPE || INITED) {
        return;
    }
    Entities.EscapeMode[] modes = {
        Entities.EscapeMode.base, Entities.EscapeMode.extended, Entities.EscapeMode.xhtml
    };
    for (Entities.EscapeMode mode : modes) {
        mode.getMap().clear();
    }
    INITED = true;
}
项目:Android-App
文件:NewsArticle.java
/**
 * Formats the body of a news article for display.
 * <p>
 * The HTML is parsed with jsoup and only the second cell of the second row of the last
 * table is kept, which drops the web article header. The first "-End-" and
 * "-Read-More-" div blocks are then removed, point-based font sizes are rewritten to a
 * pixel size scaled by the system font scale, and a trailing line break is appended.
 *
 * @param html Unformatted HTML String, usually straight from the parser or Volley's cache
 * @return Formatted String, ready to be placed within NewsDetailActivity's WebView, or other
 */
public static String formatContent(String html) {
    Document parsed = Jsoup.parse(html);
    parsed.outputSettings()
            .charset("ASCII")
            .escapeMode(Entities.EscapeMode.extended)
            .prettyPrint(false);

    // Keep only the article content: last table -> second row -> second cell.
    String article = parsed.getElementsByTag("table").last()
            .getElementsByTag("tr").get(1)
            .getElementsByTag("td").get(1)
            .html();

    // Drop the -End- and -Read-More- markers created by fccms.psdr3.org.
    article = article
            .replaceFirst("<div.+-End-.+<\\/div>", "")
            .replaceFirst("<div.+-Read-More-.+<\\/div>", "");

    // Override the text size; the hard-coded 15 is the base size scaled by the system font scale.
    int scaledSize = (int) (15 * Resources.getSystem().getConfiguration().fontScale);
    article = article.replaceAll("font-size:\\d+pt;", "font-size:" + scaledSize + "px;");

    // The extra line break pads the content at the bottom of the WebView.
    return article.concat("<br>");
}
项目:zongtui-webcrawler
文件:Html.java
/**
 * Disable jsoup html entity escape. It is a hack way only for jsoup 1.7.2.
 * <p>
 * Clears the maps returned by {@code getMap()} for the base and extended escape
 * modes, at most once (guarded by {@code INITED}) and only when
 * {@code DISABLE_HTML_ENTITY_ESCAPE} is set.
 */
private void disableJsoupHtmlEntityEscape() {
    // Nothing to do when the switch is off or the maps were already cleared.
    if (!DISABLE_HTML_ENTITY_ESCAPE || INITED) {
        return;
    }
    Entities.EscapeMode[] modes = { Entities.EscapeMode.base, Entities.EscapeMode.extended };
    for (Entities.EscapeMode mode : modes) {
        mode.getMap().clear();
    }
    INITED = true;
}
项目:herd
文件:HerdStringUtils.java
/**
 * Strips HTML tags from a given input String, allows some tags to be retained via a whitelist
 *
 * @param fragment the specified String
 * @param whitelistTags the specified whitelist tags
 *
 * @return cleaned String with allowed tags
 */
public static String stripHtml(String fragment, String... whitelistTags)
{
    // Parse out html tags except those from a given list of whitelist tags
    Document dirty = Jsoup.parseBodyFragment(fragment);
    Whitelist whitelist = new Whitelist();
    for (String whitelistTag : whitelistTags)
    {
        // Get the actual tag name from the whitelist tag by dropping every non-alphabetic character.
        // The Unicode property class needs the \p prefix: without it the bracket expression is just a
        // list of the literal characters "{IsAlphabetic}", which mangles most tag names.
        // this is vulnerable in general to complex tags but will suffice for our simple needs
        whitelistTag = StringUtils.removePattern(whitelistTag, "[^\\p{IsAlphabetic}]");
        // Add all specified tags to the whitelist while preserving inline css
        whitelist.addTags(whitelistTag).addAttributes(whitelistTag, "class");
    }
    Cleaner cleaner = new Cleaner(whitelist);
    Document clean = cleaner.clean(dirty);
    // Set character encoding to UTF-8 and make sure no line-breaks are added
    clean.outputSettings().escapeMode(Entities.EscapeMode.base).charset(StandardCharsets.UTF_8).prettyPrint(false);
    // return 'cleaned' html body
    return clean.body().html();
}
项目:astor
文件:HtmlParserTest.java
@Test public void relaxedBaseEntityMatchAndStrictExtendedMatch() {
// extended entities need a ; at the end to match, base does not
String html = "& " ® &icy &hopf и 𝕙";
Document doc = Jsoup.parse(html);
doc.outputSettings().escapeMode(Entities.EscapeMode.extended).charset("ascii"); // modifies output only to clarify test
assertEquals("& \" ® &icy &hopf и 𝕙", doc.body().html());
}
项目:astor
文件:HtmlParserTest.java
@Test public void relaxedBaseEntityMatchAndStrictExtendedMatch() {
// extended entities need a ; at the end to match, base does not
String html = "& " ® &icy &hopf и 𝕙";
Document doc = Jsoup.parse(html);
doc.outputSettings().escapeMode(Entities.EscapeMode.extended).charset("ascii"); // modifies output only to clarify test
assertEquals("& \" ® &icy &hopf и 𝕙", doc.body().html());
}
项目:lyrics
文件:KaraokeTexty.java
/**
 * Searches karaoketexty.cz for the current song and stores the URL of the first result
 * whose text equals (ignoring case) or contains "artist - song". When no result matches,
 * the search URL built from that query string is stored instead.
 */
@Override
public void makeURL() {
    String wanted=super.getArtist()+" - "+super.getSong();

    // Build the search address; a literal ampersand in the result is percent-encoded.
    String searchURL="";
    try {
        searchURL=new URI("http","www.karaoketexty.cz","/search","q="+super.getSong(),null)
            .toASCIIString()
            .replace("&","%26");
    } catch(URISyntaxException use) { System.err.println(use); }

    try {
        Document results=Jsoup.connect(searchURL).get();
        Elements links=results.select("#search > ul.title > li > a");
        results.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        for(Element link:links) {
            String title=Library.replacing(link.text());
            // Exact (case-insensitive) and partial matches lead to the same outcome.
            if(title.equalsIgnoreCase(wanted) || title.contains(wanted)) {
                super.setURL("http://www.karaoketexty.cz"+link.attr("href"));
                return;
            }
        }
        // No result matched: fall back to the search page itself.
        super.setURL("http://www.karaoketexty.cz/search?q="+wanted);
    } catch(IOException ioe) { System.err.println(ioe); }
}
项目:lyrics
文件:MetroLyrics.java
/**
 * Downloads the page at {@code getURL()} and returns the HTML of the
 * {@code #lyrics-body-text} selection with verse paragraphs turned into
 * double line breaks, after {@code Library.replacing}.
 *
 * @return the processed markup, or an error message when nothing was obtained
 */
@Override
public String parsing() {
    String lyrics="";
    try {
        Document page=Jsoup.connect(super.getURL()).get();
        Elements body=page.select("#lyrics-body-text");
        page.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        // Unwrap verse paragraphs: drop the opening tag, turn the closing tag into two breaks.
        lyrics=body.html()
            .replace("<p class=\"verse\">","")
            .replace("</p>","<br/><br/>");
        lyrics=Library.replacing(lyrics);
    } catch(IOException ioe) { System.err.println(ioe); }
    return lyrics.isEmpty()
        ? "Error: There are no lyrics for this artist and song in the database."
        : lyrics;
}
项目:lyrics
文件:Lastfm.java
/**
 * Obtains information.
 * <p>
 * Requests the URL built by {@code createAPIrequestURL(method)}, selects {@code info}
 * in the response and returns the inner HTML of the first match after
 * {@code Library.replacing}.
 *
 * @param method Method which is needed to call.
 * @param info Information which is needed to obtain.
 * @return Information, or an empty string when the request fails or nothing matches
 *         (subject to what {@code Library.replacing} returns for empty input).
 */
public String obtainInformation(String method,String info) {
    String url=createAPIrequestURL(method);
    String output="";
    try {
        Document doc=Jsoup.connect(url).get();
        Elements lyr=doc.select(info);
        doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        // first() yields null for an empty selection; guard so a missing element
        // does not surface as a NullPointerException.
        if(!lyr.isEmpty()) {
            output=lyr.first().html();
        }
        output=Library.replacing(output);
    } catch(IOException ioe) { System.err.println(ioe); }
    return output;
}
项目:jcabi-http
文件:JsoupResponse.java
/**
 * Returns the parent response's body re-serialised by jsoup using XML syntax
 * and the xhtml escape mode.
 */
@Override
public String body() {
    final Document parsed = Jsoup.parse(super.body());
    parsed.outputSettings()
        .syntax(Document.OutputSettings.Syntax.xml)
        .escapeMode(Entities.EscapeMode.xhtml);
    return parsed.html();
}
项目:yarg
文件:HtmlImportProcessorImpl.java
/**
 * Parses the source with jsoup, applies {@code processHtmlDocument} and returns the
 * document serialised with XML syntax, no pretty-printing and the xhtml escape mode.
 *
 * @param source markup to process
 * @return the serialised document
 */
@Override
public String processHtml(String source) {
    org.jsoup.nodes.Document parsed = Jsoup.parse(source);
    processHtmlDocument(parsed);

    org.jsoup.nodes.Document.OutputSettings settings = parsed.outputSettings();
    settings.syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
    settings.prettyPrint(false);
    settings.escapeMode(Entities.EscapeMode.xhtml);

    return parsed.html();
}
项目:Tanaguru
文件:Rgaa3Extractor.java
/**
 * Rewrites the HTML test-case files found below {@code RGAA3_TESTCASE_PATH}.
 * <p>
 * For each entry of that directory a test key is derived from three two-character
 * groups of its name. In every contained file whose name contains "html", the
 * {@code .test-detail} element is turned into an empty {@code div}, given a
 * {@code lang} attribute when missing, and filled with the raw rule HTML registered
 * in {@code RGAA3} for the key. Occurrences of the dotted, zero-padded key in the
 * output are replaced by {@code getRuleDot()} before the file is written back.
 *
 * @throws IOException if the source directory cannot be listed or a file cannot be read or written
 */
private static void createTestcaseFiles() throws IOException {
    File srcDir = new File(RGAA3_TESTCASE_PATH);
    File[] ruleDirs = srcDir.listFiles();
    if (ruleDirs == null) {
        // listFiles() returns null when the path is not a directory or cannot be read.
        throw new IOException("Cannot list test-case directory: " + srcDir);
    }
    for (File file : ruleDirs) {
        File[] testcases = file.listFiles();
        if (testcases == null) {
            // Not a directory (or unreadable): nothing to rewrite here.
            continue;
        }
        String fileName = file.getName().replace("Rgaa30Rule", "").replace(".java", "");
        String theme = fileName.substring(0, 2);
        String crit = fileName.substring(2, 4);
        String test = fileName.substring(4, 6);
        // Key without leading zeros, e.g. "1-2-3"; the zero-padded dotted form is the one to replace.
        String testKey = Integer.parseInt(theme) + "-" + Integer.parseInt(crit) + "-" + Integer.parseInt(test);
        String wrongKey = theme+"."+crit+"."+test;
        for (File testcase : testcases) {
            if (testcase.isFile() && testcase.getName().contains("html")) {
                Document doc = Jsoup.parse(FileUtils.readFileToString(testcase));
                Element detail = doc.select(".test-detail").first();
                if (detail == null) {
                    System.out.println(doc.outerHtml());
                } else {
                    detail.tagName("div");
                    detail.text("");
                    for (Element el : detail.children()) {
                        el.remove();
                    }
                    if (!detail.hasAttr("lang")) {
                        detail.attr("lang", "fr");
                    }
                    detail.append("\n"+RGAA3.get(testKey).ruleRawHtml+"\n");
                    doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
                    doc.outputSettings().outline(false);
                    doc.outputSettings().indentAmount(4);
                    String outputHtml = doc.outerHtml();
                    if (outputHtml.contains(wrongKey)) {
                        // Literal replacement: the key contains dots, which replaceAll would treat as
                        // regex wildcards, and the replacement text must not be parsed for $ groups.
                        outputHtml = outputHtml.replace(wrongKey, RGAA3.get(testKey).getRuleDot());
                    }
                    FileUtils.writeStringToFile(testcase, outputHtml);
                }
            }
        }
    }
}
项目:Tanaguru
文件:HTMLJsoupCleanerImpl.java
/**
 * Cleans {@code dirtyHTML}: removes bad namespace definitions, parses the markup with
 * jsoup, removes comments and malformed attributes, and stores the outlined,
 * two-space indented, xhtml-escaped outer HTML in {@code result}.
 */
@Override
public void run() {
    String withoutBadNamespaces = removeBadNamespaceDefinition(dirtyHTML);
    dirtyHTML = withoutBadNamespaces;

    Document parsed = Jsoup.parse(dirtyHTML);
    parsed.outputSettings()
        .escapeMode(Entities.EscapeMode.xhtml)
        .outline(true)
        .indentAmount(2);

    removeComments(parsed);
    removeMalformedAttributes(parsed);
    result = parsed.outerHtml();
}
项目:lyrics
文件:KaraokeTexty.java
/**
 * Searches karaoketexty.cz for the current song and stores the URL of the first result
 * whose text equals (ignoring case) or contains "artist - song". When no result matches,
 * the search URL built from that query string is stored instead.
 */
@Override
public void makeURL() {
    String wanted=super.getArtist()+" - "+super.getSong();

    // Build the search address; a literal ampersand in the result is percent-encoded.
    String searchURL="";
    try {
        searchURL=new URI("http","www.karaoketexty.cz","/search","q="+super.getSong(),null)
            .toASCIIString()
            .replace("&","%26");
    } catch(URISyntaxException use) { System.err.println(use); }

    try {
        Document results=Jsoup.connect(searchURL).get();
        Elements links=results.select("#search > ul.title > li > a");
        results.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        for(Element link:links) {
            String title=Library.replacing(link.text());
            // Exact (case-insensitive) and partial matches lead to the same outcome.
            if(title.equalsIgnoreCase(wanted) || title.contains(wanted)) {
                super.setURL("http://www.karaoketexty.cz"+link.attr("href"));
                return;
            }
        }
        // No result matched: fall back to the search page itself.
        super.setURL("http://www.karaoketexty.cz/search?q="+wanted);
    } catch(IOException ioe) { System.err.println(ioe); }
}
项目:lyrics
文件:MetroLyrics.java
/**
 * Downloads the page at {@code getURL()} and returns the HTML of the
 * {@code #lyrics-body-text} selection with verse paragraphs turned into
 * double line breaks, after {@code Library.replacing}.
 *
 * @return the processed markup, or an error message when nothing was obtained
 */
@Override
public String parsing() {
    String lyrics="";
    try {
        Document page=Jsoup.connect(super.getURL()).get();
        Elements body=page.select("#lyrics-body-text");
        page.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        // Unwrap verse paragraphs: drop the opening tag, turn the closing tag into two breaks.
        lyrics=body.html()
            .replace("<p class=\"verse\">","")
            .replace("</p>","<br/><br/>");
        lyrics=Library.replacing(lyrics);
    } catch(IOException ioe) { System.err.println(ioe); }
    return lyrics.isEmpty()
        ? "Error: There are no lyrics for this artist and song in the database."
        : lyrics;
}
项目:lyrics
文件:Lastfm.java
/**
 * Obtains information.
 * <p>
 * Requests the URL built by {@code createAPIrequestURL(method)}, selects {@code info}
 * in the response and returns the inner HTML of the first match after
 * {@code Library.replacing}.
 *
 * @param method Method which is needed to call.
 * @param info Information which is needed to obtain.
 * @return Information, or an empty string when the request fails or nothing matches
 *         (subject to what {@code Library.replacing} returns for empty input).
 */
public String obtainInformation(String method,String info) {
    String url=createAPIrequestURL(method);
    String output="";
    try {
        Document doc=Jsoup.connect(url).get();
        Elements lyr=doc.select(info);
        doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        // first() yields null for an empty selection; guard so a missing element
        // does not surface as a NullPointerException.
        if(!lyr.isEmpty()) {
            output=lyr.first().html();
        }
        output=Library.replacing(output);
    } catch(IOException ioe) { System.err.println(ioe); }
    return output;
}
项目:calendula
文件:LeafletHtmlPostProcessor.java
/**
 * Post-processes leaflet HTML for display: drops {@code nav} and {@code div#pdfurl}
 * blocks, cleans the markup against an extended relaxed whitelist, and removes
 * childless elements from a fixed tag set that carry no text other than
 * non-breaking spaces and whitespace.
 *
 * @param html the HTML fragment to process
 * @return the cleaned HTML
 */
@Override
public String process(String html) {
    // Parse the fragment and drop the blocks that are never shown.
    Document parsed = Jsoup.parseBodyFragment(html);
    parsed.select("nav").remove();
    parsed.select("div#pdfurl").remove();

    // Relaxed whitelist plus the structural tags and link/image attributes to keep.
    Whitelist allowed = Whitelist.relaxed();
    allowed.addTags("div", "span", "p", "h1", "h2", "h3", "ul", "ol", "li", "a", "img")
        .preserveRelativeLinks(true)
        .addAttributes("img", "src")
        .addAttributes("a", "href");

    Document cleaned = new Cleaner(allowed).clean(parsed);
    cleaned.outputSettings().escapeMode(Entities.EscapeMode.xhtml);

    // Tags that may be dropped when they turn out to be empty.
    Set<String> removable = new HashSet<>(Arrays.asList("div", "span", "strong", "p", "h1", "h2", "h3", "ul", "ol", "li", "a"));
    cleaned.select("p:matchesOwn((?is) )").remove();

    for (Element el : cleaned.getAllElements()) {
        if (!el.children().isEmpty()) {
            continue;
        }
        // Blank means: no text at all, or only non-breaking spaces / whitespace.
        boolean blank = !el.hasText() || el.text().replaceAll("\u00a0", "").trim().equals("");
        if (blank && removable.contains(el.tagName())) {
            el.remove();
        }
    }
    return cleaned.html();
}
项目:common
文件:Tokeniser.java
/**
 * Tries to consume a character reference (numeric or named) at the reader's current position.
 *
 * @param additionalAllowedCharacter if non-null and equal to the current character, nothing is consumed
 * @param inAttribute whether the reference occurs inside an attribute value; a named reference
 *        that is directly followed by a letter, digit, '=', '-' or '_' is then not matched
 * @return the decoded character(s), or null when no reference was consumed. Except for
 *         supplementary code points the returned array is {@code charRefHolder} itself
 *         (NOTE(review): presumably reused across calls -- callers should not retain it; confirm).
 */
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
// Nothing to consume: end of input, the caller's extra terminator, or a character
// listed in notCharRefCharsSorted.
if (reader.isEmpty())
return null;
if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
return null;
if (reader.matchesAnySorted(notCharRefCharsSorted))
return null;
final char[] charRef = charRefHolder;
// Remember the position so every failed match below can rewind to it.
reader.mark();
if (reader.matchConsume("#")) { // numbered
// An X (any case) after the '#' selects hexadecimal digits.
boolean isHexMode = reader.matchConsumeIgnoreCase("X");
String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
if (numRef.length() == 0) { // didn't match anything
characterReferenceError("numeric reference with no numerals");
reader.rewindToMark();
return null;
}
// A missing semicolon is reported but the reference is still decoded.
if (!reader.matchConsume(";"))
characterReferenceError("missing semicolon"); // missing semi
int charval = -1;
try {
int base = isHexMode ? 16 : 10;
charval = Integer.valueOf(numRef, base);
} catch (NumberFormatException e) {
} // skip
// charval stays -1 when parsing failed; that, the surrogate range and values above
// 0x10FFFF all yield the replacement character.
if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
characterReferenceError("character outside of valid range");
charRef[0] = replacementChar;
return charRef;
} else {
// todo: implement number replacement table
// todo: check for extra illegal unicode points as parse errors
// Values below the supplementary range fit into the single-char holder;
// anything else is converted with Character.toChars into a fresh array.
if (charval < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
charRef[0] = (char) charval;
return charRef;
} else
return Character.toChars(charval);
}
} else { // named
// get as many letters as possible, and look for matching entities.
String nameRef = reader.consumeLetterThenDigitSequence();
boolean looksLegit = reader.matches(';');
// found if a base named entity without a ;, or an extended entity with the ;.
boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit));
if (!found) {
reader.rewindToMark();
if (looksLegit) // named with semicolon
characterReferenceError(String.format("invalid named referenece '%s'", nameRef));
return null;
}
if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
// don't want that to match
reader.rewindToMark();
return null;
}
// A missing semicolon is reported but the entity is still resolved.
if (!reader.matchConsume(";"))
characterReferenceError("missing semicolon"); // missing semi
charRef[0] = Entities.getCharacterByName(nameRef);
return charRef;
}
}
项目:gestock
文件:Tokeniser.java
/**
 * Tries to consume a character reference (numeric or named) at the reader's current position.
 *
 * @param additionalAllowedCharacter if non-null and equal to the current character, nothing is consumed
 * @param inAttribute whether the reference occurs inside an attribute value; a named reference
 *        that is directly followed by a letter, digit, '=', '-' or '_' is then not matched
 * @return the decoded character(s), or null when no reference was consumed. Except for
 *         supplementary code points the returned array is {@code charRefHolder} itself
 *         (NOTE(review): presumably reused across calls -- callers should not retain it; confirm).
 */
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
// Nothing to consume: end of input, the caller's extra terminator, or a character
// listed in notCharRefCharsSorted.
if (reader.isEmpty())
return null;
if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
return null;
if (reader.matchesAnySorted(notCharRefCharsSorted))
return null;
final char[] charRef = charRefHolder;
// Remember the position so every failed match below can rewind to it.
reader.mark();
if (reader.matchConsume("#")) { // numbered
// An X (any case) after the '#' selects hexadecimal digits.
boolean isHexMode = reader.matchConsumeIgnoreCase("X");
String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
if (numRef.length() == 0) { // didn't match anything
characterReferenceError("numeric reference with no numerals");
reader.rewindToMark();
return null;
}
// A missing semicolon is reported but the reference is still decoded.
if (!reader.matchConsume(";"))
characterReferenceError("missing semicolon"); // missing semi
int charval = -1;
try {
int base = isHexMode ? 16 : 10;
charval = Integer.valueOf(numRef, base);
} catch (NumberFormatException e) {
} // skip
// charval stays -1 when parsing failed; that, the surrogate range and values above
// 0x10FFFF all yield the replacement character.
if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
characterReferenceError("character outside of valid range");
charRef[0] = replacementChar;
return charRef;
} else {
// todo: implement number replacement table
// todo: check for extra illegal unicode points as parse errors
// Values below the supplementary range fit into the single-char holder;
// anything else is converted with Character.toChars into a fresh array.
if (charval < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
charRef[0] = (char) charval;
return charRef;
} else
return Character.toChars(charval);
}
} else { // named
// get as many letters as possible, and look for matching entities.
String nameRef = reader.consumeLetterThenDigitSequence();
boolean looksLegit = reader.matches(';');
// found if a base named entity without a ;, or an extended entity with the ;.
boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit));
if (!found) {
reader.rewindToMark();
if (looksLegit) // named with semicolon
characterReferenceError(String.format("invalid named referenece '%s'", nameRef));
return null;
}
if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
// don't want that to match
reader.rewindToMark();
return null;
}
// A missing semicolon is reported but the entity is still resolved.
if (!reader.matchConsume(";"))
characterReferenceError("missing semicolon"); // missing semi
charRef[0] = Entities.getCharacterByName(nameRef);
return charRef;
}
}
项目:mygrades-app
文件:Scraper.java
/**
 * Makes request to the given url with given request data and method.
 * Follows redirects (including HTML redirects).
 * <p>
 * The parsed response is stored in {@code document} after its script elements and a
 * known piece of invalid markup have been removed. A {@code location} header, a
 * {@code meta} refresh tag and a {@code refresh} header each trigger a follow-up GET
 * request through a recursive call.
 *
 * @param requestData Map of key value pairs for request
 * @param method Connection.Method - HTTP method
 * @param url url as string
 * @throws IOException if there is an error connecting to the url
 * @throws URISyntaxException if a {@code location} header cannot be converted to a URI
 */
private void makeJsoupRequest(Map<String, String> requestData, Connection.Method method, String url) throws IOException, URISyntaxException {
    Connection.Response response = Jsoup.connect(url)
            .data(requestData)
            .cookies(cookies)
            .referrer(previousUrl) // some websites block without referrer
            .userAgent(Config.BROWSER_USER_AGENT) // set explicit user agent
            .method(method)
            .timeout(Config.SCRAPER_TIMEOUT)
            .followRedirects(false) // redirects are followed manually below
            .execute();
    // get cookies from response and add to all cookies
    addNewCookies(response.cookies());
    // get content from response
    document = response.parse();
    document.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
    document.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
    document.select("script").remove();
    document.select("td:contains(aktuellen ECTS-Grades)").remove(); // remove invalid html (see error #71)
    // check for location redirect
    String location = response.header("location");
    if (location != null) {
        baseUri = new URL(location).toURI();
        makeJsoupRequest(new HashMap<String, String>(), Connection.Method.GET, location);
    }
    // check for meta refresh tag
    // The attribute selector needs its closing bracket; an unbalanced "[" is not a valid selector.
    Element meta = document.select("meta[http-equiv=Refresh]").first();
    if (meta != null) {
        String content = meta.attr("content");
        if (content != null) {
            // Store the target in a helper attribute so absUrl() can resolve it against the base URI.
            meta.attr("refresh-url", content.replaceAll("(?i)^(\\d+;.*URL=)(.+)$", "$2"));
            makeJsoupRequest(new HashMap<String, String>(), Connection.Method.GET, meta.absUrl("refresh-url"));
        }
    }
    // check for refresh pseudo header
    String refreshHeader = response.header("refresh");
    if (refreshHeader != null) {
        String relativeUrl = refreshHeader.replaceAll("(?i)^(\\d+;.*URL=)(.+)$", "$2");
        String redirectUrl = StringUtil.resolve(document.baseUri(), relativeUrl);
        makeJsoupRequest(new HashMap<String, String>(), Connection.Method.GET, redirectUrl);
    }
}
项目:CN1ML-NetbeansModule
文件:Tokeniser.java
/**
 * Tries to consume a character reference at the reader's current position: either a numeric
 * one (introduced by {@code #}, optionally followed by {@code x}/{@code X} for hex) or a named one.
 *
 * @param additionalAllowedCharacter if non-null and equal to the reader's current character,
 *        nothing is consumed
 * @param inAttribute when true, a named reference directly followed by a letter, a digit,
 *        '=', '-' or '_' is not treated as a reference
 * @return the decoded character(s); a single replacement character for a numeric reference that
 *         is out of range; or null when nothing was consumed as a reference
 */
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    // No input left, or the next character can not start a reference.
    if (reader.isEmpty()
            || (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
            || reader.matchesAny('\t', '\n', '\r', '\f', ' ', '<', '&')) {
        return null;
    }

    // Remember the position so a failed attempt can be undone.
    reader.mark();

    if (reader.matchConsume("#")) { // numbered
        final boolean hex = reader.matchConsumeIgnoreCase("X");
        final String digits = hex ? reader.consumeHexSequence() : reader.consumeDigitSequence();
        if (digits.isEmpty()) { // didn't match anything
            characterReferenceError("numeric reference with no numerals");
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";")) {
            characterReferenceError("missing semicolon"); // reported, but parsing carries on
        }

        int codePoint;
        try {
            codePoint = Integer.parseInt(digits, hex ? 16 : 10);
        } catch (NumberFormatException ignored) {
            codePoint = -1; // not representable as an int; rejected by the range check below
        }

        final boolean surrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
        if (codePoint == -1 || surrogate || codePoint > 0x10FFFF) {
            characterReferenceError("character outside of valid range");
            return new char[]{replacementChar};
        }
        // todo: implement number replacement table
        // todo: check for extra illegal unicode points as parse errors
        return Character.toChars(codePoint);
    }

    // named: get as many letters as possible, and look for matching entities.
    final String entityName = reader.consumeLetterThenDigitSequence();
    final boolean semicolonNext = reader.matches(';');
    // accepted if a base named entity (semicolon optional), or any named entity followed by ';'
    final boolean known = Entities.isBaseNamedEntity(entityName)
            || (Entities.isNamedEntity(entityName) && semicolonNext);
    if (!known) {
        reader.rewindToMark();
        if (semicolonNext) { // looked like a reference, but the name is unknown
            characterReferenceError(String.format("invalid named referenece '%s'", entityName));
        }
        return null;
    }

    if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
        // inside an attribute value this continuation means it is not a reference
        reader.rewindToMark();
        return null;
    }

    if (!reader.matchConsume(";")) {
        characterReferenceError("missing semicolon");
    }
    return new char[]{Entities.getCharacterByName(entityName)};
}
项目:astor
文件:Tokeniser.java
/**
 * Tries to consume a character reference (numeric after a {@code #}, or named) at the reader's
 * current position.
 * <p>
 * For results that fit in a single {@code char}, the returned array is the {@code charRefHolder}
 * field itself, whose first element is overwritten by this method.
 *
 * @param additionalAllowedCharacter if non-null and equal to the reader's current character,
 *        nothing is consumed
 * @param inAttribute when true, a named reference directly followed by a letter, a digit,
 *        '=', '-' or '_' is not treated as a reference
 * @return the decoded character(s); the replacement character for an out-of-range numeric
 *         reference; or null when nothing was consumed as a reference
 */
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    if (reader.isEmpty()) {
        return null;
    }
    if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) {
        return null;
    }
    if (reader.matchesAnySorted(notCharRefCharsSorted)) {
        return null;
    }

    final char[] holder = charRefHolder;
    reader.mark(); // rewind target for failed attempts

    if (!reader.matchConsume("#")) { // named
        // get as many letters as possible, and look for matching entities.
        final String entityName = reader.consumeLetterThenDigitSequence();
        final boolean semicolonNext = reader.matches(';');
        // accepted if a base named entity (semicolon optional), or any named entity followed by ';'
        final boolean known = Entities.isBaseNamedEntity(entityName)
                || (Entities.isNamedEntity(entityName) && semicolonNext);
        if (!known) {
            reader.rewindToMark();
            if (semicolonNext) { // named with semicolon
                characterReferenceError(String.format("invalid named referenece '%s'", entityName));
            }
            return null;
        }
        if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
            // don't want that to match
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";")) {
            characterReferenceError("missing semicolon"); // missing semi
        }
        holder[0] = Entities.getCharacterByName(entityName);
        return holder;
    }

    // numbered
    final boolean hex = reader.matchConsumeIgnoreCase("X");
    final String digits = hex ? reader.consumeHexSequence() : reader.consumeDigitSequence();
    if (digits.isEmpty()) { // didn't match anything
        characterReferenceError("numeric reference with no numerals");
        reader.rewindToMark();
        return null;
    }
    if (!reader.matchConsume(";")) {
        characterReferenceError("missing semicolon"); // missing semi
    }

    int codePoint;
    try {
        codePoint = Integer.parseInt(digits, hex ? 16 : 10);
    } catch (NumberFormatException ignored) {
        codePoint = -1; // not representable as an int; rejected by the range check below
    }

    final boolean surrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
    if (codePoint == -1 || surrogate || codePoint > 0x10FFFF) {
        characterReferenceError("character outside of valid range");
        holder[0] = replacementChar;
        return holder;
    }

    // todo: implement number replacement table
    // todo: check for extra illegal unicode points as parse errors
    if (codePoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT) {
        // needs a surrogate pair, which does not fit the one-element holder
        return Character.toChars(codePoint);
    }
    holder[0] = (char) codePoint;
    return holder;
}
项目:astor
文件:Tokeniser.java
/**
 * Tries to consume a character reference (numeric after a {@code #}, or named) at the reader's
 * current position and returns its code point(s).
 * <p>
 * The returned array is one of the {@code codepointHolder} / {@code multipointHolder} fields,
 * which this method writes into.
 *
 * @param additionalAllowedCharacter if non-null and equal to the reader's current character,
 *        nothing is consumed
 * @param inAttribute when true, a named reference directly followed by a letter, a digit,
 *        '=', '-' or '_' is not treated as a reference
 * @return the decoded code point(s); the replacement character for an out-of-range numeric
 *         reference; or null when nothing was consumed as a reference
 */
int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    // No input left, or the next character can not start a reference.
    if (reader.isEmpty()
            || (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
            || reader.matchesAnySorted(notCharRefCharsSorted)) {
        return null;
    }

    final int[] single = codepointHolder;
    reader.mark(); // rewind target for failed attempts

    if (reader.matchConsume("#")) { // numbered
        final boolean hex = reader.matchConsumeIgnoreCase("X");
        final String digits = hex ? reader.consumeHexSequence() : reader.consumeDigitSequence();
        if (digits.isEmpty()) { // didn't match anything
            characterReferenceError("numeric reference with no numerals");
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";")) {
            characterReferenceError("missing semicolon"); // missing semi
        }

        int codePoint;
        try {
            codePoint = Integer.parseInt(digits, hex ? 16 : 10);
        } catch (NumberFormatException ignored) {
            codePoint = -1; // not representable as an int; rejected by the range check below
        }

        final boolean surrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
        if (codePoint == -1 || surrogate || codePoint > 0x10FFFF) {
            characterReferenceError("character outside of valid range");
            single[0] = replacementChar;
        } else {
            // todo: implement number replacement table
            // todo: check for extra illegal unicode points as parse errors
            single[0] = codePoint;
        }
        return single;
    }

    // named: get as many letters as possible, and look for matching entities.
    final String entityName = reader.consumeLetterThenDigitSequence();
    final boolean semicolonNext = reader.matches(';');
    // accepted if a base named entity (semicolon optional), or any named entity followed by ';'
    final boolean known = Entities.isBaseNamedEntity(entityName)
            || (Entities.isNamedEntity(entityName) && semicolonNext);
    if (!known) {
        reader.rewindToMark();
        if (semicolonNext) { // named with semicolon
            characterReferenceError(String.format("invalid named referenece '%s'", entityName));
        }
        return null;
    }
    if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
        // don't want that to match
        reader.rewindToMark();
        return null;
    }
    if (!reader.matchConsume(";")) {
        characterReferenceError("missing semicolon"); // missing semi
    }

    // The lookup fills multipointHolder and reports how many code points it wrote.
    final int count = Entities.codepointsForName(entityName, multipointHolder);
    switch (count) {
        case 1:
            single[0] = multipointHolder[0];
            return single;
        case 2:
            return multipointHolder;
        default:
            Validate.fail("Unexpected characters returned for " + entityName);
            return multipointHolder;
    }
}
项目:astor
文件:Tokeniser.java
/**
 * Attempts to read a character reference from the reader's current position and to return the
 * code point(s) it stands for. Handles numeric references (a {@code #}, an optional
 * {@code x}/{@code X}, then digits) and named references.
 * <p>
 * The result is always one of the shared {@code codepointHolder} / {@code multipointHolder}
 * arrays, which are overwritten here.
 *
 * @param additionalAllowedCharacter if non-null and equal to the reader's current character,
 *        nothing is consumed
 * @param inAttribute when true, a named reference directly followed by a letter, a digit,
 *        '=', '-' or '_' is not treated as a reference
 * @return the decoded code point(s); the replacement character for an out-of-range numeric
 *         reference; or null when nothing was consumed as a reference
 */
int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    if (reader.isEmpty()) {
        return null;
    }
    if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) {
        return null;
    }
    if (reader.matchesAnySorted(notCharRefCharsSorted)) {
        return null;
    }

    final int[] result = codepointHolder;
    reader.mark(); // so that a failed attempt can be rolled back

    if (!reader.matchConsume("#")) { // named
        // get as many letters as possible, and look for matching entities.
        final String name = reader.consumeLetterThenDigitSequence();
        final boolean terminated = reader.matches(';');
        // accepted if a base named entity (semicolon optional), or any named entity followed by ';'
        final boolean recognised = Entities.isBaseNamedEntity(name)
                || (Entities.isNamedEntity(name) && terminated);
        if (!recognised) {
            reader.rewindToMark();
            if (terminated) { // named with semicolon
                characterReferenceError(String.format("invalid named referenece '%s'", name));
            }
            return null;
        }
        if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
            // don't want that to match
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";")) {
            characterReferenceError("missing semicolon"); // missing semi
        }

        // The lookup fills multipointHolder and reports how many code points it wrote.
        final int written = Entities.codepointsForName(name, multipointHolder);
        if (written == 1) {
            result[0] = multipointHolder[0];
            return result;
        }
        if (written != 2) {
            Validate.fail("Unexpected characters returned for " + name);
        }
        return multipointHolder;
    }

    // numbered
    final boolean hexadecimal = reader.matchConsumeIgnoreCase("X");
    final String numerals = hexadecimal ? reader.consumeHexSequence() : reader.consumeDigitSequence();
    if (numerals.isEmpty()) { // didn't match anything
        characterReferenceError("numeric reference with no numerals");
        reader.rewindToMark();
        return null;
    }
    if (!reader.matchConsume(";")) {
        characterReferenceError("missing semicolon"); // missing semi
    }

    int value;
    try {
        value = Integer.parseInt(numerals, hexadecimal ? 16 : 10);
    } catch (NumberFormatException ignored) {
        value = -1; // not representable as an int; rejected by the range check below
    }

    final boolean inSurrogateRange = value >= 0xD800 && value <= 0xDFFF;
    final boolean invalid = value == -1 || inSurrogateRange || value > 0x10FFFF;
    if (invalid) {
        characterReferenceError("character outside of valid range");
    }
    // todo: implement number replacement table
    // todo: check for extra illegal unicode points as parse errors
    result[0] = invalid ? replacementChar : value;
    return result;
}