/** * Fetches captions/transcript for a given video * @param videoID to fetch * @param lang this captions should be in * @throws IOException */ public void getAndSaveTranscript(String videoID, String lang) throws IOException { lang = LanguageCode.convertIso2toIso1(lang); String url = captionEndPoint+"lang="+lang+"&v="+videoID; GetMethod get = new GetMethod(url); this.client.executeMethod(get); String xmlData = get.getResponseBodyAsString(); //parse XML Document doc = Jsoup.parse(xmlData, "", Parser.xmlParser()); String allCaps = ""; for (Element e : doc.select("text")) { allCaps += e.text(); } FileSaver file = new FileSaver(allCaps, lang, "youtube_caps", url, videoID); file.save(logDb); }
/** * Checks if a given video has captions in our target language. As identified by the user who entered them * @param videoID to check * @param lang target * @return true if there are captions in lang * @throws IOException */ public boolean videoHasCaptionsInLanguage(String videoID, String lang) throws IOException { //visit captions index GetMethod get = new GetMethod(captionsIndex+videoID); this.client.executeMethod(get); String xmlData = get.getResponseBodyAsString(); //parse XML Document doc = Jsoup.parse(xmlData, "", Parser.xmlParser()); //iterate over all captions for (Element e : doc.select("track")) { String langCode = e.attr("lang_code"); String fixedLangCode = LanguageCode.convertIso1toIso2(langCode); if(fixedLangCode.equals(lang)) return true; } return false; }
/** * Parse a given HTML template and return the a result object containing the expressions * and a transformed HTML. * @param htmlTemplate The HTML template to process, as a String * @param context Context of the Component we are currently processing * @return A {@link TemplateParserResult} containing the processed template and expressions */ public TemplateParserResult parseHtmlTemplate(String htmlTemplate, TemplateParserContext context) { this.context = context; Parser parser = Parser.htmlParser(); parser.settings(new ParseSettings(true, true)); // tag, attribute preserve case Document doc = parser.parseInput(htmlTemplate, ""); result = new TemplateParserResult(); processImports(doc); processNode(doc); result.setProcessedTemplate(doc.body().html()); return result; }
/**
 * Downloads the news feed, parses it as XML and maps every {@code item} element to a
 * {@code News} object. An {@link IOException} is logged and whatever was collected so far
 * (possibly nothing) is returned.
 */
@Override
protected ArrayList<News> doInBackground(Void... voids) {
    final ArrayList<News> collected = new ArrayList<News>();
    try {
        final Document feed = Jsoup.connect(BASE_URL + NEWS_ENDPOINT)
                .parser(Parser.xmlParser())
                .get();
        final Elements items = feed.select("item");
        for (final Element item : items) {
            final News entry = new News();
            entry.title = item.select("title").text();
            entry.description = item.select("description").text();
            entry.thumbnail = item.select("media|thumbnail").text();
            entry.pubDate = item.select("pubDate").text();
            entry.link = item.select("link").text();
            collected.add(entry);
        }
    } catch (IOException e) {
        Log.e(TAG, "FetchNews error", e);
    }
    return collected;
}
/**
 * Strips markup from the given text.
 *
 * <p>First every match of the pre/source/syntax/comment/mono-tag/fake-tag patterns is replaced
 * by a single space. The remainder is then parsed with the XML parser, the content of a fixed
 * list of elements is blanked, and the document text is returned. If that second step throws,
 * the stack trace is printed and the regex-cleaned string is returned instead (best effort).
 *
 * @param markup text containing markup
 * @return the text with markup removed
 */
public String removeHtmlTags(String markup) {
    String clean = preMatcher.matcher(markup).replaceAll(" ");
    clean = sourceMatcher.matcher(clean).replaceAll(" ");
    clean = syntaxMatcher.matcher(clean).replaceAll(" ");
    clean = commentMatcher.matcher(clean).replaceAll(" ");
    clean = monoTagMatcher.matcher(clean).replaceAll(" ");
    clean = fakeTagMatcher.matcher(clean).replaceAll(" ");
    try {
        Document document = Jsoup.parse(clean, "", Parser.xmlParser());
        // "blockquote" was previously misspelled "blockqoute" and therefore never matched.
        document.select("math, gallery, ref, br, ins, s, del, tt, blockquote, table").html(" ");
        clean = document.text();
    } catch (Exception e) {
        // Deliberate best effort: fall back to the regex-cleaned text.
        e.printStackTrace();
    }
    return clean;
}
/**
 * Loads the thumb-info XML for a video and turns it into a track.
 *
 * @param videoId identifier appended to the getthumbinfo URL
 * @return the track extracted by {@code extractTrackFromXml}
 * @throws FriendlyException wrapping any {@link IOException}, including a non-200 response
 */
private AudioTrack loadTrack(String videoId) {
    checkLoggedIn();

    HttpGet infoRequest = new HttpGet("http://ext.nicovideo.jp/api/getthumbinfo/" + videoId);

    // Resources are closed in reverse order: the response first, then the interface.
    try (HttpInterface httpInterface = getHttpInterface();
         CloseableHttpResponse response = httpInterface.execute(infoRequest)) {
        int status = response.getStatusLine().getStatusCode();
        if (status != 200) {
            throw new IOException("Unexpected response code from video info: " + status);
        }

        Document xml = Jsoup.parse(
                response.getEntity().getContent(),
                StandardCharsets.UTF_8.name(),
                "",
                Parser.xmlParser());
        return extractTrackFromXml(videoId, xml);
    } catch (IOException e) {
        throw new FriendlyException("Error occurred when extracting video info.", SUSPICIOUS, e);
    }
}
/**
 * Returns the class names listed in the first daily plan file that can be downloaded.
 *
 * <p>Dates are tried in order, starting four days before today and going up to
 * {@code MAX_DAYS}. A response with status 404 or 300 moves on to the next date; any other
 * {@code HttpResponseException} is rethrown. {@code Kl} entries without a {@code Kurz} child
 * are skipped.
 *
 * @return the class names of the first plan found, or an empty list if no plan is available
 */
@Override
public List<String> getAllClasses() throws IOException, JSONException, CredentialInvalidException {
    String baseurl = data.getString(PARAM_BASEURL) + "/";
    for (int i = -4; i < MAX_DAYS; i++) {
        LocalDate date = LocalDate.now().plusDays(i);
        String dateStr = DateTimeFormat.forPattern("yyyyMMdd").print(date);
        // The timestamp query parameter makes every request URL unique.
        String url = baseurl + "mobdaten/PlanKl" + dateStr + ".xml?_=" + System.currentTimeMillis();
        try {
            String xml = httpGet(url, "UTF-8");
            Document doc = Jsoup.parse(xml, url, Parser.xmlParser());
            List<String> classes = new ArrayList<>();
            for (Element klasse : doc.select("Klassen > Kl")) {
                Element kurz = klasse.select("Kurz").first();
                // first() yields null when there is no <Kurz>; skip instead of throwing an NPE.
                if (kurz != null) {
                    classes.add(kurz.text());
                }
            }
            return classes;
        } catch (HttpResponseException e) {
            if (e.getStatusCode() != 404 && e.getStatusCode() != 300) {
                throw e;
            }
        }
    }
    return new ArrayList<>();
}
/**
 * Parses a document whose meta tag declares the charset {@code iso-8} with no explicit
 * charset supplied, and checks the serialized result.
 */
@Test
public void wrongMetaCharsetFallback() {
    try {
        final byte[] htmlBytes =
                "<html><head><meta charset=iso-8></head><body></body></html>".getBytes("UTF-8");
        final ByteBuffer buffer = ByteBuffer.wrap(htmlBytes);

        // No charset is passed, so it has to be determined from the data itself.
        Document parsed = DataUtil.parseByteData(buffer, null, "http://example.com", Parser.htmlParser());

        final String expected = "<html>\n"
                + " <head>\n"
                + " <meta charset=\"iso-8\">\n"
                + " </head>\n"
                + " <body></body>\n"
                + "</html>";

        assertEquals(expected, parsed.toString());
    } catch (UnsupportedEncodingException ex) {
        fail(ex.getMessage());
    }
}
public static void normalXmlParse(){ String json = CrawlerPack.getFromRemote(url); String xml = CrawlerPack.jsonToXml(json); // 原始 json 轉為 xml 的結果 System.out.println( "原始XML" ) ; System.out.println( xml ); Document jsoupDoc = Jsoup.parse(xml, "", Parser.xmlParser()); jsoupDoc.charset(StandardCharsets.UTF_8); // 發生了什麼事? System.out.println( "轉換後XML" ) ; System.out.println(jsoupDoc.toString()); }
/**
 * Loads the OPML document, from {@code mUrl} when it is set and otherwise from {@code mFile},
 * and stores its {@code outline} elements in {@code mOpmlItems}.
 *
 * @return {@code "success"}, or the exception message if loading failed with an IOException
 */
@Override
protected String doInBackground(String... strings) {
    Document opml;
    try {
        opml = (mUrl != null)
                ? Jsoup.connect(mUrl).parser(Parser.xmlParser()).get()
                : Jsoup.parse(mFile, "UTF-8");
    } catch (IOException e) {
        e.printStackTrace();
        return e.getMessage();
    }

    if (opml != null) {
        mOpmlItems = opml.select("outline");
    }
    return "success";
}
public static Document getDocument(String filepath) { Document doc = null; InputStream is = null; try { is = new FileInputStream(filepath); } catch (FileNotFoundException e1) { // TODO Auto-generated catch block System.out.println("FileUtils: no such XML file path exists"); e1.printStackTrace(); return null; } try { doc = Jsoup.parse(is, "UTF-8", "", Parser.xmlParser()); } catch (Exception e) { System.out.println("Parse file to XML Document error!"); e.printStackTrace(); } return doc; }
private static String getBranchRegex(String jobXml) { try { String branchRegex = Jsoup.parse(jobXml, "", Parser.xmlParser()) .getElementsByTag("gerritProjects").get(0) .getElementsByTag(GERRITPROJECT_TAG).get(0) .getElementsByTag("branches").get(0) .getElementsByTag(BRANCH_TAG).get(0) .getElementsByTag("pattern").get(0).html(); // Remove "^" and "$" at the beginning and the end, respectively branchRegex = branchRegex.substring(1, branchRegex.length() - 1); // Remove sections of regex that we add post-user-input branchRegex = branchRegex.replace("(?!refs/meta/)", ""); branchRegex = branchRegex.replace("(?!refs/)", "refs/heads/"); return branchRegex; } catch (IndexOutOfBoundsException e) { return null; } }
/**
 * Parses {@code str} with the XML parser and renders it according to {@code outType}:
 * HTML for {@code TYPE_HTML}, plain text for {@code TYPE_TEXT} and for any other value.
 */
private String parse(String str) {
    final Document parsed = Jsoup.parse(str, "", Parser.xmlParser());
    switch (outType) {
        case TYPE_HTML:
            return parsed.html();
        case TYPE_TEXT:
        default:
            return parsed.text();
    }
}
/** * Read information about user. Here you can read other important info. * @throws Exception */ private void getUserInfo() throws Exception { //https://www.box.net/api/1.0/rest?action=get_auth_token&api_key=vkf3k5dh0tg1ibvcikjcp8sx0f89d14u&ticket= //https://www.box.net/api/1.0/rest?action=get_auth_token&api_key=vkf3k5dh0tg1ibvcikjcp8sx0f89d14u&ticket=xybt9orxzo1xrr5vk4r0axne804y1tpk NULogger.getLogger().log(Level.INFO, "{0} Getting auth token value............", getClass()); httpGet = new NUHttpGet("https://www.box.net/api/1.0/rest?action=get_auth_token&api_key=vkf3k5dh0tg1ibvcikjcp8sx0f89d14u&ticket=" + ticket); httpResponse = httpclient.execute(httpGet, httpContext); responseString = EntityUtils.toString(httpResponse.getEntity()); //NULogger.getLogger().log(Level.INFO, "{0}Response : {1}", new Object[]{getClass(), stringResponse}); doc = Jsoup.parse(responseString, "", Parser.xmlParser()); String auth_token = doc.select("response auth_token").text(); NULogger.getLogger().log(Level.INFO, "{0} Auth_token : {1}", new Object[]{getClass(), auth_token}); properties().setEncryptedProperty(KEY_AUTH_TOKEN, auth_token); }
/**
 * A 200 response without any content must be executable and parseable, both with the default
 * parser and with the XML parser.
 */
@Test
public void handles200WithNoContent() throws IOException {
    // default parser
    Connection con = Jsoup
            .connect("http://direct.infohound.net/tools/200-no-content.pl")
            .userAgent(browserUa);
    Connection.Response res = con.execute();
    res.parse(); // must not throw
    assertEquals(200, res.statusCode());

    // XML parser
    con = Jsoup
            .connect("http://direct.infohound.net/tools/200-no-content.pl")
            .parser(Parser.xmlParser())
            .userAgent(browserUa);
    res = con.execute();
    res.parse(); // must not throw
    assertEquals(200, res.statusCode());
}
static public YoudaoResult getDefinition(String key) throws IOException{ Document doc = Jsoup.connect(String.format(BASE_URL, key.trim())) .userAgent("Mozilla") .cookie("auth", "token") .timeout(2000) .parser(Parser.xmlParser()) .get(); //doc.toString(); String phonetic = getSingleQueryResult(doc, "phonetic-symbol"); String returnPhrase = getSingleQueryResult(doc, "return-phrase"); List<String> translation = new ArrayList<String>(); for(Element e : doc.select("translation > content")){ translation.add(e.text()); } Map<String, List<String>> webTranslation = new LinkedHashMap<>(); for(Element web : doc.select("web-translation")){ String keyString = getSingleQueryResult(web, "key"); List<String> values = new ArrayList<>(); for(Element value : web.select("trans > value")){ String valueString = value.text().trim(); values.add(valueString); } webTranslation.put(keyString, values); } YoudaoResult youdaoResult = new YoudaoResult(); youdaoResult.phonetic = phonetic; youdaoResult.returnPhrase = returnPhrase; youdaoResult.translation = translation; youdaoResult.webTranslation = webTranslation; return youdaoResult; }
/**
 * Prepends a generated heading list to the article content.
 *
 * <p>For every {@code h1}–{@code h5} element an anchor {@code span} is inserted in front of
 * the heading, and a list item linking to that anchor is added to the list. The stylesheet
 * link, the list and the body HTML are then written back as the article content.
 *
 * @param data event data holding the article under {@code Article.ARTICLE}
 * @throws EventException declared by the handler contract
 */
public void action(JSONObject data) throws EventException {
    final JSONObject article = data.optJSONObject(Article.ARTICLE);
    String content = article.optString(Article.ARTICLE_CONTENT);

    final Document doc = Jsoup.parse(content, StringUtils.EMPTY, Parser.htmlParser());
    doc.outputSettings().prettyPrint(false);

    final StringBuilder listBuilder = new StringBuilder();
    listBuilder.append("<link rel=\"stylesheet\" type=\"text/css\" href=\"")
            .append(Latkes.getStaticServePath())
            .append("/plugins/list/style.css\" />");

    final Elements hs = doc.select("h1, h2, h3, h4, h5");
    listBuilder.append("<ul class='b3-solo-list'>");
    for (int i = 0; i < hs.size(); i++) {
        final Element element = hs.get(i);
        final String tagName = element.tagName().toLowerCase();
        // text() returns unescaped text; re-escape it so a heading containing "&" or "<"
        // cannot break or inject markup into the generated list.
        final String text = element.text()
                .replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;");
        final String id = "b3_solo_" + tagName + "_" + i;

        element.before("<span id='" + id + "'></span>");
        listBuilder.append("<li class='b3-solo-list-").append(tagName).append("'><a href='#").append(id)
                .append("'>").append(text).append("</a></li>");
    }
    listBuilder.append("</ul>");

    final Element body = doc.getElementsByTag("body").get(0);
    content = listBuilder.toString() + body.html();
    article.put(Article.ARTICLE_CONTENT, content);
}
/**
 * Parses the response page (using {@code link} as base URI) and builds the search result from
 * the direct children of its body.
 */
@Override
protected IGoogleImageSearchResult parseResult(String response) {
    final Document page = Parser.parse(response, link.toString());
    final Elements bodyChildren = page.body().children();

    return new GoogleImageSearchResult.Builder()
            .addBestGuess(retrieveBestGuessFromHTML(bodyChildren))
            .addLinks(retrieveLinksFromHTML(bodyChildren))
            .addDescriptions(retrieveDescriptionFromHTML(bodyChildren))
            .addTitles(retrieveTitleFromHTML(bodyChildren))
            .addSimilarImages(retrieveSimilarImageFromHTML(bodyChildren))
            .build();
}
/**
 * Extracts the user-instance symbol from a certificate.
 *
 * <p>All {@code ':'} characters are removed before parsing (so namespace-prefixed names become
 * plain names such as {@code samlAttributeValue}). The text of the <em>second</em> matching
 * attribute value is returned.
 *
 * @param certificate certificate XML
 * @return the text of the second match, or an empty string if fewer than two values match
 */
public String findSymbolInCertificate(String certificate) {
    Elements els = Jsoup.parse(certificate.replaceAll(":", ""), "", Parser.xmlParser())
            .select("[AttributeName=\"UserInstance\"] samlAttributeValue");

    // Index 1 is read below, so at least two matches are required; checking only isEmpty()
    // let a single match through and failed with IndexOutOfBoundsException.
    if (els.size() < 2) {
        return "";
    }
    return els.get(1).text();
}
/**
 * Downloads the document at {@code params[0]}, parses it as XML and stores the scraped raw
 * data and URLs in {@code rawData} and {@code urls}. An {@link IOException} is printed and
 * otherwise ignored.
 *
 * @param params the URL to load is expected at index 0
 * @return always {@code null}
 */
@Override
protected String doInBackground(String... params) {
    System.setProperty("http.agent", "Chrome");
    // try-with-resources closes the network stream even when parsing fails; it was
    // previously never closed.
    try (java.io.InputStream in = new URL(params[0]).openStream()) {
        Document doc = Jsoup.parse(in, "UTF-8", "", Parser.xmlParser());
        rawData = PartPickerScraper.getRawData(doc);
        urls = PartPickerScraper.getUrlsFromDoc(doc);
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}
/**
 * Replaces the children of the first element in {@code content} with copies of the child
 * nodes of {@code scriptContentDocument} and returns the resulting markup.
 *
 * @param content XML whose first element is emptied and refilled
 * @param scriptContentDocument document whose child nodes are copied in
 * @return the HTML of the rebuilt document
 */
private String getFragmentContent(String content, Document scriptContentDocument) {
    // The second argument of Jsoup.parse(String, String, Parser) is the base URI, not a
    // charset; "UTF-8" was being used as base URI. No base URI is needed here.
    Document resultDocument = Jsoup.parse(content, "", Parser.xmlParser());
    Element scriptTag = resultDocument.child(0).empty();
    scriptContentDocument.childNodesCopy().forEach(scriptTag::appendChild);
    return resultDocument.html();
}
/**
 * Normalizes markup for comparison: line breaks are removed, whitespace between tags and
 * between tags and braces is dropped, and the result is re-serialized through the XML parser
 * using {@code OUTPUT_SETTINGS}.
 *
 * @param text markup to normalize
 * @return the normalized, trimmed markup
 */
private String clean(String text) {
    String cleanText = text.replace("\n", "")
            .replaceAll(">(\\s)+<", "><")
            .replaceAll(">(\\s)+\\{", ">{")
            .replaceAll("\\}(\\s)+<", "}<");

    // The second argument of Jsoup.parse(String, String, Parser) is the base URI, not a
    // charset; "UTF-8" was being used as base URI. No base URI is needed here.
    return Jsoup.parse(cleanText, "", Parser.xmlParser())
            .outputSettings(OUTPUT_SETTINGS)
            .html()
            .trim();
}
/**
 * Resolves the DASH URL through the cipher manager, downloads it and extracts the track
 * formats from the XML response.
 *
 * @throws IOException if the response status is not 200
 */
private List<YoutubeTrackFormat> loadTrackFormatsFromDash(String dashUrl, HttpInterface httpInterface, String playerScript) throws Exception {
    String resolvedDashUrl = sourceManager.getCipherManager().getValidDashUrl(httpInterface, playerScript, dashUrl);
    HttpGet dashRequest = new HttpGet(resolvedDashUrl);

    try (CloseableHttpResponse response = httpInterface.execute(dashRequest)) {
        int status = response.getStatusLine().getStatusCode();
        if (status != 200) {
            throw new IOException("Invalid status code for track info page response: " + status);
        }

        Document manifest = Jsoup.parse(response.getEntity().getContent(), CHARSET, "", Parser.xmlParser());
        return loadTrackFormatsFromDashDocument(manifest);
    }
}
/**
 * Converts HTML text to plain text: every tag except <code>br</code> is stripped, entities
 * such as <code>&amp;quot;</code> are unescaped, and each remaining <code>br</code> is
 * replaced by a <code>\r\n</code> line break.
 *
 * @param htmlText the HTML to convert
 * @return the plain-text rendering of {@code htmlText}
 */
String htmlTextToPlainText(final String htmlText) {
    // Only <br> survives the cleaning step.
    final Whitelist brOnly = Whitelist.none();
    brOnly.addTags("br");

    final Document stripped = new Cleaner(brOnly).clean(Jsoup.parse(htmlText));
    stripped.outputSettings()
            .prettyPrint(false)
            .escapeMode(EscapeMode.xhtml)
            .charset(StandardCharsets.UTF_8);

    final String remainingMarkup = stripped.body().html().trim();
    final String unescaped = Parser.unescapeEntities(remainingMarkup, true);
    return unescaped.replaceAll("<br(?: ?/)?>", "\r\n");
}
/**
 * Returns the {@code href} of the first {@code resource} inside a {@code resources} element.
 * If several {@code resources} elements contain a {@code resource}, the last one wins.
 *
 * @param xml XML to search
 * @return the href, or {@code null} if no {@code resources} element contains a {@code resource}
 */
@Override
public String findParentFile(String xml) {
    String ret = null;
    Document doc = Jsoup.parse(xml, "", Parser.xmlParser());
    for (Element e : doc.select("resources")) {
        Element firstResource = e.select("resource").first();
        // An empty <resources> used to fail with IndexOutOfBoundsException via get(0).
        if (firstResource != null) {
            ret = firstResource.attr("href");
        }
    }
    return ret;
}
/**
 * Parses {@code html} from its UTF-8 bytes with the HTML parser (no charset is passed to
 * jsoup) and turns off pretty-printing and indentation for later output.
 */
private static Document parse(String html) throws IOException {
    ByteArrayInputStream htmlBytes = new ByteArrayInputStream(html.getBytes(UTF_8));
    Document parsed = Jsoup.parse(htmlBytes, null, "", Parser.htmlParser());
    parsed.outputSettings()
            .indentAmount(0)
            .prettyPrint(false);
    return parsed;
}
/**
 * Reads a TEI document from {@code file}, traverses its {@code TEI > text} element with a
 * {@code Visitor} and creates speaker, stage direction, utterance and speech annotations,
 * followed by acts/scenes and the cast. Finally the listed annotation types are trimmed.
 */
@Override
public void getNext(final JCas jcas, InputStream file, Drama drama) throws IOException, CollectionException {
    Document teiDocument = Jsoup.parse(file, "UTF-8", "", Parser.xmlParser());

    Visitor visitor = new Visitor(jcas);
    Element textRoot = teiDocument.select("TEI > text").first();
    textRoot.traverse(visitor);
    visitor.getJCas();

    select2Annotation(jcas, textRoot, visitor.getAnnotationMap(), "speaker", Speaker.class, null);
    select2Annotation(jcas, textRoot, visitor.getAnnotationMap(), "stage", StageDirection.class, null);
    select2Annotation(jcas, textRoot, visitor.getAnnotationMap(), "sp", Utterance.class, null,
            new Select2AnnotationCallback<Utterance>() {
                @Override
                public void call(Utterance annotation, Element xmlElement) {
                    // Every speaker covered by the utterance receives all ids from "who".
                    String[] whos = xmlElement.attr("who").split(" ");
                    for (Speaker speaker : JCasUtil.selectCovered(Speaker.class, annotation)) {
                        speaker.setXmlId(new StringArray(jcas, whos.length));
                        for (int idx = 0; idx < whos.length; idx++) {
                            speaker.setXmlId(idx, whos[idx]);
                        }
                    }
                }
            });
    select2Annotation(jcas, textRoot, visitor.getAnnotationMap(), "l", Speech.class, null);

    readActsAndScenes(jcas, textRoot, visitor.getAnnotationMap(), true);
    readCast(jcas, drama, teiDocument);

    AnnotationUtil.trim(new ArrayList<Figure>(JCasUtil.select(jcas, Figure.class)));
    AnnotationUtil.trim(new ArrayList<Speech>(JCasUtil.select(jcas, Speech.class)));
    AnnotationUtil.trim(new ArrayList<Utterance>(JCasUtil.select(jcas, Utterance.class)));
    AnnotationUtil.trim(new ArrayList<Scene>(JCasUtil.select(jcas, Scene.class)));
    AnnotationUtil.trim(new ArrayList<Act>(JCasUtil.select(jcas, Act.class)));
}
/**
 * Reads an XML document from {@code file} and traverses its {@code TEI > text > body}
 * element with a {@code FolgerVisitor}, logging progress along the way.
 */
@Override
public void getNext(JCas jcas, InputStream file, Drama drama) throws IOException, CollectionException {
    getLogger().log(Level.INFO, "Now parsing XML document");
    Document xmlDocument = Jsoup.parse(file, "UTF-8", "", Parser.xmlParser());

    Visitor visitor = new FolgerVisitor(jcas);
    Element body = xmlDocument.select("TEI > text > body").first();

    getLogger().log(Level.INFO, "Traversing XML nodes");
    body.traverse(visitor);
    jcas = visitor.getJCas();
    getLogger().log(Level.INFO, "Finished Traversing");
}
/**
 * Builds an {@code AdditionalInfo} from a feed: the text is the first {@code item description},
 * and the title is extended with the first {@code pubDate}. When the text equals the fixed
 * "no notices" sentence, the info is flagged as carrying no information.
 */
@NotNull
static AdditionalInfo handleXML(String xml) {
    final String noNoticesText = "Zurzeit gibt es keine Hinweise auf witterungsbedingten Unterrichtsausfall.";

    AdditionalInfo info = new AdditionalInfo();
    info.setTitle(TITLE);

    Document feed = Jsoup.parse(xml, "", Parser.xmlParser());
    String description = feed.select("item description").first().text();
    if (noNoticesText.equals(description)) {
        info.setHasInformation(false);
    }

    String published = feed.select("pubDate").first().text();
    info.setTitle(TITLE + " (Stand: " + published + ")");
    info.setText(description);
    return info;
}
void parseIndiwarePage(SubstitutionSchedule v, String response) throws JSONException, IOException { boolean html; Element doc; if (response.contains("<html") || response.contains("<table")) { html = true; doc = Jsoup.parse(response); } else { html = false; doc = Jsoup.parse(response, "", Parser.xmlParser()); } if (html && data.has(PARAM_EMBEDDED_CONTENT_SELECTOR)) { String selector = data.getString(PARAM_EMBEDDED_CONTENT_SELECTOR); Elements elems = doc.select(selector); if (elems.size() == 0) throw new IOException("No elements found using " + selector); for (Element elem : elems) { v.addDay(parseIndiwareDay(elem, true)); } } else if (html && doc.select(".vpfuer").size() > 1) { // multiple schedules after each other on one page String[] htmls = doc.html().split("<span class=\"vpfuer\">"); for (int i = 1; i < htmls.length; i++) { Document splitDoc = Jsoup.parse(htmls[i]); v.addDay(parseIndiwareDay(splitDoc, true)); } } else { v.addDay(parseIndiwareDay(doc, html)); } }
/** Parses the bundled Indiware mobile sample and checks date, last change and entry count. */
@Test
public void demoTest() throws IOException, JSONException {
    String sampleXml = readResource("/indiware-mobile/indiware-mobile.xml");
    Document sample = Jsoup.parse(sampleXml, "", Parser.xmlParser());

    SubstitutionScheduleDay parsedDay = IndiwareMobileParser.parseDay(sample, new ColorProvider());

    assertEquals(new LocalDate(2017, 6, 21), parsedDay.getDate());
    assertEquals(new LocalDateTime(2017, 6, 20, 10, 28), parsedDay.getLastChange());
    assertEquals(192, parsedDay.getSubstitutions().size());
}
/** The XML and the HTML variant of the same schedule must parse to equal days. */
@Test
public void testEquals() throws IOException, JSONException {
    SubstitutionScheduleDay fromXml =
            parser.parseIndiwareDay(Jsoup.parse(xml, "", Parser.xmlParser()), false);
    SubstitutionScheduleDay fromHtml =
            parser.parseIndiwareDay(Jsoup.parse(html), true);

    assertEquals(fromXml, fromHtml);
}
/**
 * Creates a request with the defaults: GET, redirects followed, a 3000 ms timeout, a 1 MB
 * body limit, no data, gzip accepted and the HTML parser.
 */
private Request() {
    method = Method.GET;
    followRedirects = true;
    timeoutMilliseconds = 3000;
    maxBodySizeBytes = 1024 * 1024; // 1MB
    data = new ArrayList<Connection.KeyVal>();
    headers.put("Accept-Encoding", "gzip");
    parser = Parser.htmlParser();
}
/**
 * Appends inner HTML to this element. The supplied HTML is parsed with this element as
 * context, and each resulting node is added after the existing children.
 *
 * @param html HTML to add inside this element, after the existing HTML
 * @return this element
 * @see #html(String)
 */
public Element append(String html) {
    Validate.notNull(html);

    List<Node> parsed = Parser.parseFragment(html, this, baseUri());
    Node[] newChildren = parsed.toArray(new Node[parsed.size()]);
    addChildren(newChildren);
    return this;
}
/**
 * Prepends inner HTML to this element. The supplied HTML is parsed with this element as
 * context, and each resulting node is inserted before the existing children.
 *
 * @param html HTML to add inside this element, before the existing HTML
 * @return this element
 * @see #html(String)
 */
public Element prepend(String html) {
    Validate.notNull(html);

    List<Node> parsed = Parser.parseFragment(html, this, baseUri());
    Node[] newChildren = parsed.toArray(new Node[parsed.size()]);
    addChildren(0, newChildren);
    return this;
}
/**
 * Parses {@code html} and inserts the resulting nodes into the parent's children at
 * {@code index}. The parent is used as parsing context when it is an {@code Element};
 * otherwise no context is given.
 */
private void addSiblingHtml(int index, String html) {
    Validate.notNull(html);
    Validate.notNull(parentNode);

    Element context = null;
    if (parent() instanceof Element) {
        context = (Element) parent();
    }

    List<Node> siblings = Parser.parseFragment(html, context, baseUri());
    Node[] siblingArray = siblings.toArray(new Node[siblings.size()]);
    parentNode.addChildren(index, siblingArray);
}
@Test public void fetchHandlesXmlAsHtmlWhenParserSet() throws IOException { // should auto-detect xml and use XML parser, unless explicitly requested the html parser String xmlUrl = "http://direct.infohound.net/tools/parse-xml.xml"; Connection con = Jsoup.connect(xmlUrl).parser(Parser.htmlParser()); Document doc = con.get(); Connection.Request req = con.request(); assertTrue(req.parser().getTreeBuilder() instanceof HtmlTreeBuilder); assertEquals("<html> <head></head> <body> <xml> <link>one <table> Two </table> </xml> </body> </html>", StringUtil.normaliseWhitespace(doc.outerHtml())); }