@Test public void supplyOutputSettings() { // test that one can override the default document output settings Document.OutputSettings os = new Document.OutputSettings(); os.prettyPrint(false); os.escapeMode(Entities.EscapeMode.extended); os.charset("ascii"); String html = "<div><p>ℬ</p></div>"; String customOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os); String defaultOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed()); assertNotSame(defaultOut, customOut); assertEquals("<div><p>ℬ</p></div>", customOut); assertEquals("<div>\n" + " <p>ℬ</p>\n" + "</div>", defaultOut); os.charset("ASCII"); os.escapeMode(Entities.EscapeMode.base); String customOut2 = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os); assertEquals("<div><p>ℬ</p></div>", customOut2); }
/**
 * Asserts that the printed form of {@code actualDoc} has the same body markup as
 * {@code expectedSpec}.
 *
 * Both sides are parsed with jsoup and serialised with the xhtml escape mode and pretty
 * printing switched off; elements matching {@code style:first-of-type} are removed from the
 * actual body before the comparison.
 *
 * @param expectedSpec expected markup
 * @param actualDoc    document under test; must not be null
 */
private void assertSpecification( String expectedSpec, Document actualDoc) {
    assertNotNull( actualDoc );

    // Render the document under test into memory.
    StringWriter rendered = new StringWriter();
    actualDoc.print( new PrintWriter( rendered ) );

    org.jsoup.nodes.Document expectedHtml = Jsoup.parse(expectedSpec);
    expectedHtml.outputSettings().escapeMode(Entities.EscapeMode.xhtml).prettyPrint(false);
    Element expectedBody = expectedHtml.body();

    org.jsoup.nodes.Document actualHtml = Jsoup.parse(rendered.toString());
    actualHtml.outputSettings().escapeMode(Entities.EscapeMode.xhtml).prettyPrint(false);
    Element actualBody = actualHtml.body();
    actualBody.select("style:first-of-type").remove();

    assertEquals( expectedBody.outerHtml(), actualBody.outerHtml() );
}
/**
 * Asserts that the printed form of {@code doc} has the same body markup as the text
 * returned by {@code specification()}.
 *
 * Both sides are parsed with jsoup and serialised with the xhtml escape mode and pretty
 * printing switched off; elements matching {@code style:first-of-type} are removed from the
 * actual body before the comparison.
 *
 * @param doc document under test; must not be null
 */
private void assertSpecification( Document doc ) {
    assertNotNull( doc );

    // Render the document under test into memory.
    StringWriter rendered = new StringWriter();
    doc.print( new PrintWriter( rendered ) );

    org.jsoup.nodes.Document expectedHtml = Jsoup.parse(specification());
    expectedHtml.outputSettings().escapeMode(Entities.EscapeMode.xhtml).prettyPrint(false);
    Element expectedBody = expectedHtml.body();

    org.jsoup.nodes.Document actualHtml = Jsoup.parse(rendered.toString());
    actualHtml.outputSettings().escapeMode(Entities.EscapeMode.xhtml).prettyPrint(false);
    Element actualBody = actualHtml.body();
    actualBody.select("style:first-of-type").remove();

    Assert.assertEquals( expectedBody.outerHtml(), actualBody.outerHtml() );
}
@Test public void supplyOutputSettings() { // test that one can override the default document output settings Document.OutputSettings os = new Document.OutputSettings(); os.prettyPrint(false); os.escapeMode(Entities.EscapeMode.extended); os.charset("ascii"); String html = "<div><p>ℬ</p></div>"; String customOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os); String defaultOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed()); assertNotSame(defaultOut, customOut); assertEquals("<div><p>ℬ</p></div>", customOut); // entities now prefers shorted names if aliased assertEquals("<div>\n" + " <p>ℬ</p>\n" + "</div>", defaultOut); os.charset("ASCII"); os.escapeMode(Entities.EscapeMode.base); String customOut2 = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os); assertEquals("<div><p>ℬ</p></div>", customOut2); }
/**
 * Downloads the page at {@code super.getURL()} and returns the inner HTML of the first
 * {@code p.text} element, passed through {@code Library.replacing}.
 *
 * Connection problems are reported on {@code System.err}. When nothing could be extracted
 * an error message is returned instead of an empty string.
 *
 * @return the processed lyrics markup, or an error message when none was found
 */
@Override
public String parsing() {
    String output = "";
    try {
        Document doc = Jsoup.connect(super.getURL()).get();
        doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        Elements lyr = doc.select("p.text");
        // Check for "no match" explicitly rather than catching a NullPointerException.
        if (lyr.isEmpty()) {
            System.err.println("No element matching 'p.text' found at " + super.getURL());
        } else {
            output = lyr.first().html();
        }
        output = Library.replacing(output);
    } catch (IOException ioe) {
        System.err.println(ioe);
    }
    if (output.isEmpty()) {
        output += "Error: There are no lyrics for this artist and song in the database.";
    }
    return output;
}
/**
 * Downloads the page at {@code super.getURL()}, strips helper markup, scripts and comments,
 * and returns the inner HTML of the {@code .lyricbox} elements, passed through
 * {@code Library.replacing}.
 *
 * @return the processed lyrics markup, a notice for instrumental songs, or an error message
 *         when nothing was extracted
 */
@Override
public String parsing() {
    String lyrics = "";
    try {
        Document page = Jsoup.connect(super.getURL()).get();

        // Drop everything that is not lyrics text before reading the lyric box.
        for (String selector : new String[] {".rtMatcher", ".lyricsBreak", "script"}) {
            page.select(selector).remove();
        }
        Library.removeComments(page);

        page.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        lyrics = page.select(".lyricbox").html();
        lyrics = Library.replacing(lyrics);
    } catch (IOException ioe) {
        System.err.println(ioe);
    }

    boolean instrumental = lyrics.contains("<span") && lyrics.contains("title=\"Instrumental\"");
    if (instrumental) {
        return "This is an instrumental song with no lyrics.";
    }
    if (lyrics.isEmpty()) {
        return "Error: There are no lyrics for this artist and song in the database.";
    }
    return lyrics;
}
@Test public void supplyOutputSettings() { // test that one can override the default document output settings Document.OutputSettings os = new Document.OutputSettings(); os.prettyPrint(false); os.escapeMode(Entities.EscapeMode.extended); String html = "<div><p>ℬ</p></div>"; String customOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os); String defaultOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed()); assertNotSame(defaultOut, customOut); assertEquals("<div><p>ℬ</p></div>", customOut); assertEquals("<div>\n" + " <p>ℬ</p>\n" + "</div>", defaultOut); os.charset("ASCII"); os.escapeMode(Entities.EscapeMode.base); String customOut2 = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os); assertEquals("<div><p>ℬ</p></div>", customOut2); }
protected <T extends XmlObject> String getText(T att) { if(att == null) return ""; Document doc = Jsoup.parse(att.xmlText()); doc.outputSettings().syntax(Document.OutputSettings.Syntax.html); doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml); List<Node> childNodes = doc.body().childNodes(); if ( childNodes.size() == 1 && "#text".equals(childNodes.get(0).nodeName())) { // only text, parse as plaintext. XmlCursor cursor = att.newCursor(); String value = cursor.getTextValue(); cursor.dispose(); return value == null ? "" : value; } else { StringBuilder sb = new StringBuilder(); for (Node child : childNodes) { child.traverse(new DDIReadNodeVisitor()); child.html(sb); } return sb.toString(); } }
public static WordprocessingMLPackage handle(WordprocessingMLPackage wmlPackage, Document doc,boolean fragment,boolean altChunk) throws IOException, Docx4JException { //设置转换模式 doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml).escapeMode(Entities.EscapeMode.xhtml); //转为 xhtml 格式 if(altChunk){ //Document对象 MainDocumentPart document = wmlPackage.getMainDocumentPart(); //获取Jsoup参数 String charsetName = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_CHARSETNAME, Docx4jConstants.DEFAULT_CHARSETNAME ); //设置转换模式 doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml).escapeMode(Entities.EscapeMode.xhtml); //转为 xhtml 格式 //创建html导入对象 //XHTMLImporterImpl xhtmlImporter = new XHTMLImporterImpl(wordMLPackage); document.addAltChunk(AltChunkType.Xhtml, (fragment ? doc.body().html() : doc.html()) .getBytes(Charset.forName(charsetName))); //document.addAltChunk(type, bytes, attachmentPoint) //document.addAltChunk(type, is) //document.addAltChunk(type, is, attachmentPoint) WordprocessingMLPackage tempPackage = document.convertAltChunks(); //返回处理后的WordprocessingMLPackage对象 return tempPackage; } //创建html导入对象 XHTMLImporterImpl xhtmlImporter = new XHTMLImporterImpl(wmlPackage); //将xhtml转换为wmlPackage可用的对象 List<Object> list = xhtmlImporter.convert((fragment ? doc.body().html() : doc.html()), doc.baseUri()); //导入转换后的内容对象 wmlPackage.getMainDocumentPart().getContent().addAll(list); //返回原WordprocessingMLPackage对象 return wmlPackage; }
/** * 将页面转为{@link org.jsoup.nodes.Document}对象,xhtml 格式 * * @param url * @return * @throws Exception */ protected Document url2xhtml(String url) throws Exception { Document doc = Jsoup.connect(url).get(); //获得 if (logger.isDebugEnabled()) { logger.debug("baseUri: {}", doc.baseUri()); } for (Element script : doc.getElementsByTag("script")) { //除去所有 script script.remove(); } for (Element a : doc.getElementsByTag("a")) { //除去 a 的 onclick,href 属性 a.removeAttr("onclick"); a.removeAttr("href"); } Elements links = doc.getElementsByTag("link"); //将link中的地址替换为绝对地址 for (Element element : links) { String href = element.absUrl("href"); if (logger.isDebugEnabled()) { logger.debug("href: {} -> {}", element.attr("href"), href); } element.attr("href", href); } doc.outputSettings() .syntax(Document.OutputSettings.Syntax.xml) .escapeMode(Entities.EscapeMode.xhtml); //转为 xhtml 格式 if (logger.isDebugEnabled()) { String[] split = doc.html().split("\n"); for (int c = 0; c < split.length; c++) { logger.debug("line {}:\t{}", c + 1, split[c]); } } return doc; }
private static List<Object> convertToWmlObject( WordprocessingMLPackage wordMLPackage, String content) throws Docx4JException, JAXBException { MainDocumentPart document = wordMLPackage.getMainDocumentPart(); //获取Jsoup参数 String charsetName = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_CONVERT_OUT_WMLTEMPLATE_CHARSETNAME, Docx4jConstants.DEFAULT_CHARSETNAME ); List<Object> wmlObjList = null; String templateString = XmlUtils.marshaltoString(document.getContents().getBody()); System.out.println(templateString); Body templateBody = document.getContents().getBody(); try { document.getContents().setBody(XmlUtils.deepCopy(templateBody)); document.getContent().clear(); Document doc = Jsoup.parse(content); doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml).escapeMode(Entities.EscapeMode.xhtml); //XHTMLImporterImpl xhtmlImporter = new XHTMLImporterImpl(wordMLPackage); AlternativeFormatInputPart part = document.addAltChunk(AltChunkType.Xhtml,doc.html().getBytes(Charset.forName(charsetName))); WordprocessingMLPackage tempPackage = document.convertAltChunks(); File file = new File("d://temp.docx"); tempPackage.save(file); wmlObjList = document.getContent(); //part.getOwningRelationshipPart().getSourceP().get //wmlObjList = xhtmlImporter.convert(doc.html(), doc.baseUri()); } finally { document.getContents().setBody(templateBody); } return wmlObjList; }
/**
 * Clears the maps of jsoup's base, extended and xhtml escape modes.
 *
 * Does nothing unless {@code DISABLE_HTML_ENTITY_ESCAPE} is set, and runs at most once:
 * {@code INITED} is set after the maps have been cleared.
 */
private void disableJsoupHtmlEntityEscape() {
    if (!DISABLE_HTML_ENTITY_ESCAPE || INITED) {
        return;
    }
    Entities.EscapeMode[] modes = {
        Entities.EscapeMode.base,
        Entities.EscapeMode.extended,
        Entities.EscapeMode.xhtml
    };
    for (Entities.EscapeMode mode : modes) {
        mode.getMap().clear();
    }
    INITED = true;
}
/** * Method to handle the formatting of the news article's body. In here Jsoup is used to remove * the web article header, as well as regex overrides for "-Read-More-" and "-End-" tags, and an * override to adjust text size per the users currently set text size. * * @param html Unformatted HTML String, usually straight from the parser or Volley's cache * @return Formatted String, ready to be placed within NewsDetailActivity's WebView, or other */ public static String formatContent(String html) { Document resultD = Jsoup.parse(html); resultD.outputSettings().charset("ASCII"); resultD.outputSettings().escapeMode(Entities.EscapeMode.extended); resultD.outputSettings().prettyPrint(false); // Select only the content, removing the web header String result = resultD.getElementsByTag("table").last() .getElementsByTag("tr").get(1) .getElementsByTag("td").get(1) .html(); // Removing the -End- and -Read-More- tags created by fccms.psdr3.org result = result.replaceFirst("<div.+-End-.+<\\/div>", ""); result = result.replaceFirst("<div.+-Read-More-.+<\\/div>", ""); // Overriding the text size. Hard coded "15" can be changed as the scalar quantity. int fontScale = (int) (15 * Resources.getSystem().getConfiguration().fontScale); result = result.replaceAll("font-size:\\d+pt;", "font-size:" + fontScale + "px;"); // Add an extra line to the HTML to make the content pad well at the bottom of the WebView result = result.concat("<br>"); return result; }
/**
 * Disables jsoup HTML entity escaping by clearing the maps of the base and extended
 * escape modes. This is a hack intended only for jsoup 1.7.2.
 *
 * Does nothing unless {@code DISABLE_HTML_ENTITY_ESCAPE} is set, and runs at most once:
 * {@code INITED} is set after the maps have been cleared.
 */
private void disableJsoupHtmlEntityEscape() {
    if (!DISABLE_HTML_ENTITY_ESCAPE || INITED) {
        return;
    }
    Entities.EscapeMode[] modes = {
        Entities.EscapeMode.base,
        Entities.EscapeMode.extended
    };
    for (Entities.EscapeMode mode : modes) {
        mode.getMap().clear();
    }
    INITED = true;
}
/** * Strips HTML tags from a given input String, allows some tags to be retained via a whitelist * * @param fragment the specified String * @param whitelistTags the specified whitelist tags * * @return cleaned String with allowed tags */ public static String stripHtml(String fragment, String... whitelistTags) { // Parse out html tags except those from a given list of whitelist tags Document dirty = Jsoup.parseBodyFragment(fragment); Whitelist whitelist = new Whitelist(); for (String whitelistTag : whitelistTags) { // Get the actual tag name from the whitelist tag // this is vulnerable in general to complex tags but will suffice for our simple needs whitelistTag = StringUtils.removePattern(whitelistTag, "[^\\{IsAlphabetic}]"); // Add all specified tags to the whitelist while preserving inline css whitelist.addTags(whitelistTag).addAttributes(whitelistTag, "class"); } Cleaner cleaner = new Cleaner(whitelist); Document clean = cleaner.clean(dirty); // Set character encoding to UTF-8 and make sure no line-breaks are added clean.outputSettings().escapeMode(Entities.EscapeMode.base).charset(StandardCharsets.UTF_8).prettyPrint(false); // return 'cleaned' html body return clean.body().html(); }
@Test public void relaxedBaseEntityMatchAndStrictExtendedMatch() { // extended entities need a ; at the end to match, base does not String html = "& " ® &icy &hopf и 𝕙"; Document doc = Jsoup.parse(html); doc.outputSettings().escapeMode(Entities.EscapeMode.extended).charset("ascii"); // modifies output only to clarify test assertEquals("& \" ® &icy &hopf и 𝕙", doc.body().html()); }
/**
 * Searches www.karaoketexty.cz for the song and stores the URL of the matching result.
 *
 * The search query contains only the song title. A result is accepted when its text
 * (after {@code Library.replacing}) equals "artist - song" ignoring case, or contains it.
 * When no result is accepted, a search URL built from "artist - song" is stored instead.
 * Errors are reported on {@code System.err}.
 */
@Override
public void makeURL() {
    final String wanted = super.getArtist() + " - " + super.getSong();

    String searchURL = "";
    try {
        URI searchUri = new URI("http", "www.karaoketexty.cz", "/search", "q=" + super.getSong(), null);
        searchURL = searchUri.toASCIIString().replace("&", "%26");
    } catch (URISyntaxException use) {
        System.err.println(use);
    }

    try {
        Document results = Jsoup.connect(searchURL).get();
        results.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        for (Element hit : results.select("#search > ul.title > li > a")) {
            String title = Library.replacing(hit.text());
            if (title.equalsIgnoreCase(wanted) || title.contains(wanted)) {
                super.setURL("http://www.karaoketexty.cz" + hit.attr("href"));
                return;
            }
        }
        // Nothing matched: fall back to the search page.
        super.setURL("http://www.karaoketexty.cz/search?q=" + wanted);
    } catch (IOException ioe) {
        System.err.println(ioe);
    }
}
/**
 * Downloads the page at {@code super.getURL()} and returns the inner HTML of
 * {@code #lyrics-body-text}, with verse paragraphs turned into double line breaks and the
 * result passed through {@code Library.replacing}.
 *
 * @return the processed lyrics markup, or an error message when nothing was extracted
 */
@Override
public String parsing() {
    String lyrics = "";
    try {
        Document page = Jsoup.connect(super.getURL()).get();
        page.outputSettings().escapeMode(Entities.EscapeMode.xhtml);

        // Replace the verse paragraphs by plain line breaks.
        lyrics = page.select("#lyrics-body-text").html()
                .replace("<p class=\"verse\">", "")
                .replace("</p>", "<br/><br/>");
        lyrics = Library.replacing(lyrics);
    } catch (IOException ioe) {
        System.err.println(ioe);
    }

    if (lyrics.isEmpty()) {
        return "Error: There are no lyrics for this artist and song in the database.";
    }
    return lyrics;
}
/**
 * Obtains information.
 *
 * Requests the URL built by {@code createAPIrequestURL(method)} and returns the inner HTML
 * of the first element matching {@code info}, passed through {@code Library.replacing}.
 * Connection errors are reported on {@code System.err}.
 *
 * @param method Method which is needed to call.
 * @param info Information which is needed to obtain (a selector for the response).
 * @return Information, or an empty string when the request fails or nothing matches.
 */
public String obtainInformation(String method, String info) {
    String url = createAPIrequestURL(method);
    String output = "";
    try {
        Document doc = Jsoup.connect(url).get();
        doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        Elements lyr = doc.select(info);
        // An empty selection would make first() return null; treat it like "no information".
        if (!lyr.isEmpty()) {
            output = lyr.first().html();
            output = Library.replacing(output);
        }
    } catch (IOException ioe) {
        System.err.println(ioe);
    }
    return output;
}
/**
 * Returns the parent's body re-serialised by jsoup with xml syntax and the xhtml escape mode.
 */
@Override
public String body() {
    final Document parsed = Jsoup.parse(super.body());
    parsed.outputSettings()
            .syntax(Document.OutputSettings.Syntax.xml)
            .escapeMode(Entities.EscapeMode.xhtml);
    return parsed.html();
}
/**
 * Parses {@code source}, applies {@code processHtmlDocument} to the parsed document and
 * returns it serialised with xml syntax, without pretty printing, using the xhtml escape mode.
 *
 * @param source markup to process
 * @return the processed markup
 */
@Override
public String processHtml(String source) {
    org.jsoup.nodes.Document parsed = Jsoup.parse(source);
    processHtmlDocument(parsed);

    org.jsoup.nodes.Document.OutputSettings settings = parsed.outputSettings();
    settings.syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
    settings.prettyPrint(false);
    settings.escapeMode(Entities.EscapeMode.xhtml);

    return parsed.html();
}
/**
 * Rewrites the {@code .test-detail} element of every HTML test case found below
 * {@code RGAA3_TESTCASE_PATH}.
 *
 * For each entry of that directory a rule key is derived from the entry name (three
 * two-digit groups: theme, criterion, test). In every contained file whose name contains
 * "html", the detail element is turned into an empty {@code div} (with {@code lang="fr"}
 * unless a language is already set), filled with the rule's raw HTML, and occurrences of
 * the zero-padded dotted key are replaced by the rule's dotted form. The file is then
 * overwritten. Files without a detail element are printed to stdout and left untouched.
 *
 * NOTE(review): files are read and written with the platform default charset - confirm
 * that this is intended.
 *
 * @throws IOException if the test-case directory cannot be listed or a file cannot be
 *                     read or written
 */
private static void createTestcaseFiles() throws IOException {
    File srcDir = new File(RGAA3_TESTCASE_PATH);
    File[] ruleDirs = srcDir.listFiles();
    if (ruleDirs == null) {
        // listFiles() returns null when the path is not a readable directory.
        throw new IOException("Cannot list test-case directory: " + srcDir);
    }

    for (File ruleDir : ruleDirs) {
        File[] testcases = ruleDir.listFiles();
        if (testcases == null) {
            // Not a directory: nothing to rewrite here.
            continue;
        }

        String fileName = ruleDir.getName().replace("Rgaa30Rule", "").replace(".java", "");
        String theme = fileName.substring(0, 2);
        String crit = fileName.substring(2, 4);
        String test = fileName.substring(4, 6);
        // Key without leading zeros, e.g. "1-2-3".
        String testKey = Integer.parseInt(theme) + "-" + Integer.parseInt(crit) + "-" + Integer.parseInt(test);
        // Zero-padded dotted form, e.g. "01.02.03".
        String wrongKey = theme + "." + crit + "." + test;

        for (File testcase : testcases) {
            if (!testcase.isFile() || !testcase.getName().contains("html")) {
                continue;
            }
            Document doc = Jsoup.parse(FileUtils.readFileToString(testcase));
            Element detail = doc.select(".test-detail").first();
            if (detail == null) {
                System.out.println(doc.outerHtml());
                continue;
            }

            detail.tagName("div");
            detail.text("");
            for (Element el : detail.children()) {
                el.remove();
            }
            if (!detail.hasAttr("lang")) {
                detail.attr("lang", "fr");
            }
            detail.append("\n" + RGAA3.get(testKey).ruleRawHtml + "\n");

            doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
            doc.outputSettings().outline(false);
            doc.outputSettings().indentAmount(4);

            String outputHtml = doc.outerHtml();
            if (outputHtml.contains(wrongKey)) {
                // Literal replacement: the key contains dots, which a regex would treat
                // as wildcards.
                outputHtml = outputHtml.replace(wrongKey, RGAA3.get(testKey).getRuleDot());
            }
            FileUtils.writeStringToFile(testcase, outputHtml);
        }
    }
}
/**
 * Cleans {@code dirtyHTML} and stores the outcome in {@code result}.
 *
 * The markup is passed through {@code removeBadNamespaceDefinition}, parsed with jsoup,
 * stripped of comments and malformed attributes, and serialised with the xhtml escape
 * mode, outline mode and an indent of 2.
 */
@Override
public void run() {
    dirtyHTML = removeBadNamespaceDefinition(dirtyHTML);

    Document parsed = Jsoup.parse(dirtyHTML);
    parsed.outputSettings()
            .escapeMode(Entities.EscapeMode.xhtml)
            .outline(true)
            .indentAmount(2);

    removeComments(parsed);
    removeMalformedAttributes(parsed);

    result = parsed.outerHtml();
}
@Override public String process(String html) { // Parse str into a Document Document doc = Jsoup.parseBodyFragment(html); doc.select("nav").remove(); doc.select("div#pdfurl").remove(); // white list to clean html Whitelist wl = Whitelist.relaxed(); wl.addTags("div", "span", "p", "h1", "h2", "h3", "ul", "ol", "li", "a", "img"); wl.preserveRelativeLinks(true); wl.addAttributes("img", "src"); wl.addAttributes("a", "href"); // perform cleaning Document cleaned = new Cleaner(wl).clean(doc); cleaned.outputSettings().escapeMode(Entities.EscapeMode.xhtml); // Remove empty elements Set<String> removable = new HashSet<>(Arrays.asList("div", "span", "strong", "p", "h1", "h2", "h3", "ul", "ol", "li", "a")); cleaned.select("p:matchesOwn((?is) )").remove(); // For each element in the cleaned document for (Element el : cleaned.getAllElements()) { if (el.children().isEmpty() && (!el.hasText() || el.text().replaceAll("\u00a0", "").trim().equals(""))) { // Element is empty, check if should be removed if (removable.contains(el.tagName())) el.remove(); } } // return html for display return cleaned.html(); }
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) { if (reader.isEmpty()) return null; if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) return null; if (reader.matchesAnySorted(notCharRefCharsSorted)) return null; final char[] charRef = charRefHolder; reader.mark(); if (reader.matchConsume("#")) { // numbered boolean isHexMode = reader.matchConsumeIgnoreCase("X"); String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence(); if (numRef.length() == 0) { // didn't match anything characterReferenceError("numeric reference with no numerals"); reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi int charval = -1; try { int base = isHexMode ? 16 : 10; charval = Integer.valueOf(numRef, base); } catch (NumberFormatException e) { } // skip if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) { characterReferenceError("character outside of valid range"); charRef[0] = replacementChar; return charRef; } else { // todo: implement number replacement table // todo: check for extra illegal unicode points as parse errors if (charval < Character.MIN_SUPPLEMENTARY_CODE_POINT) { charRef[0] = (char) charval; return charRef; } else return Character.toChars(charval); } } else { // named // get as many letters as possible, and look for matching entities. String nameRef = reader.consumeLetterThenDigitSequence(); boolean looksLegit = reader.matches(';'); // found if a base named entity without a ;, or an extended entity with the ;. 
boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit)); if (!found) { reader.rewindToMark(); if (looksLegit) // named with semicolon characterReferenceError(String.format("invalid named referenece '%s'", nameRef)); return null; } if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) { // don't want that to match reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi charRef[0] = Entities.getCharacterByName(nameRef); return charRef; } }
/** * Makes request to the given url with given request data and method. * Follows redirects (including HTML redirects). * * @param requestData Map of key value pairs for request * @param method Connection.Method - HTTP method * @param url url as string * @throws IOException if there is an error connecting to the url */ private void makeJsoupRequest(Map<String, String> requestData, Connection.Method method, String url) throws IOException, URISyntaxException { Connection.Response response = Jsoup.connect(url) .data(requestData) .cookies(cookies) .referrer(previousUrl) // some websites block without referrer .userAgent(Config.BROWSER_USER_AGENT) // set explicit user agent .method(method) .timeout(Config.SCRAPER_TIMEOUT) .followRedirects(false) .execute(); // get cookies from response and add to all cookies addNewCookies(response.cookies()); // get content from response document = response.parse(); document.outputSettings().escapeMode(Entities.EscapeMode.xhtml); document.outputSettings().syntax(Document.OutputSettings.Syntax.xml); document.select("script").remove(); document.select("td:contains(aktuellen ECTS-Grades)").remove(); // remove invalid html (see error #71) // check for location redirect String location = response.header("location"); if (location != null) { baseUri = new URL(location).toURI(); makeJsoupRequest(new HashMap<String, String>(), Connection.Method.GET, location); } // check for meta refresh tag Element meta = document.select("meta[http-equiv=Refresh").first(); if (meta != null) { String content = meta.attr("content"); if (content != null) { meta.attr("refresh-url", content.replaceAll("(?i)^(\\d+;.*URL=)(.+)$", "$2")); makeJsoupRequest(new HashMap<String, String>(), Connection.Method.GET, meta.absUrl("refresh-url")); } } // check for refresh pseudo header String refreshHeader = response.header("refresh"); if (refreshHeader != null) { String relativeUrl = refreshHeader.replaceAll("(?i)^(\\d+;.*URL=)(.+)$", "$2"); String redirectUrl = 
StringUtil.resolve(document.baseUri(), relativeUrl); makeJsoupRequest(new HashMap<String, String>(), Connection.Method.GET, redirectUrl); } }
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) { if (reader.isEmpty()) return null; if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) return null; if (reader.matchesAny('\t', '\n', '\r', '\f', ' ', '<', '&')) return null; reader.mark(); if (reader.matchConsume("#")) { // numbered boolean isHexMode = reader.matchConsumeIgnoreCase("X"); String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence(); if (numRef.length() == 0) { // didn't match anything characterReferenceError("numeric reference with no numerals"); reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi int charval = -1; try { int base = isHexMode ? 16 : 10; charval = Integer.valueOf(numRef, base); } catch (NumberFormatException e) { } // skip if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) { characterReferenceError("character outside of valid range"); return new char[]{replacementChar}; } else { // todo: implement number replacement table // todo: check for extra illegal unicode points as parse errors return Character.toChars(charval); } } else { // named // get as many letters as possible, and look for matching entities. String nameRef = reader.consumeLetterThenDigitSequence(); boolean looksLegit = reader.matches(';'); // found if a base named entity without a ;, or an extended entity with the ;. 
boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit)); if (!found) { reader.rewindToMark(); if (looksLegit) // named with semicolon characterReferenceError(String.format("invalid named referenece '%s'", nameRef)); return null; } if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) { // don't want that to match reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi return new char[]{Entities.getCharacterByName(nameRef)}; } }
int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) { if (reader.isEmpty()) return null; if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) return null; if (reader.matchesAnySorted(notCharRefCharsSorted)) return null; final int[] codeRef = codepointHolder; reader.mark(); if (reader.matchConsume("#")) { // numbered boolean isHexMode = reader.matchConsumeIgnoreCase("X"); String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence(); if (numRef.length() == 0) { // didn't match anything characterReferenceError("numeric reference with no numerals"); reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi int charval = -1; try { int base = isHexMode ? 16 : 10; charval = Integer.valueOf(numRef, base); } catch (NumberFormatException ignored) { } // skip if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) { characterReferenceError("character outside of valid range"); codeRef[0] = replacementChar; return codeRef; } else { // todo: implement number replacement table // todo: check for extra illegal unicode points as parse errors codeRef[0] = charval; return codeRef; } } else { // named // get as many letters as possible, and look for matching entities. String nameRef = reader.consumeLetterThenDigitSequence(); boolean looksLegit = reader.matches(';'); // found if a base named entity without a ;, or an extended entity with the ;. 
boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit)); if (!found) { reader.rewindToMark(); if (looksLegit) // named with semicolon characterReferenceError(String.format("invalid named referenece '%s'", nameRef)); return null; } if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) { // don't want that to match reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi int numChars = Entities.codepointsForName(nameRef, multipointHolder); if (numChars == 1) { codeRef[0] = multipointHolder[0]; return codeRef; } else if (numChars ==2) { return multipointHolder; } else { Validate.fail("Unexpected characters returned for " + nameRef); return multipointHolder; } } }