Java Examples for org.jsoup.nodes.Document
The following Java examples will help you understand the usage of org.jsoup.nodes.Document. These source code samples are taken from different open source projects.
Example 1
| Project: SlideshowFX-master File: DOMUtils.java View source code |
/**
 * Writes the pretty-printed HTML of {@code document} to {@code file}.
 *
 * <p>Pretty-printing is enabled on the document's own output settings, so the setting
 * remains switched on for {@code document} after this call. I/O failures are printed
 * to standard error and otherwise ignored (best effort).
 *
 * @param document the jsoup document to serialize
 * @param file the destination file
 */
public static void saveDocument(Document document, File file) {
    document.outputSettings().prettyPrint(true);
    try (final Writer output = new DefaultCharsetWriter(file)) {
        output.write(document.outerHtml());
    } catch (IOException e) {
        // FileNotFoundException is an IOException, so a single handler covers both.
        e.printStackTrace();
    }
}Example 2
| Project: lavender-master File: RamdomImgParser.java View source code |
/**
 * Finds the image URL inside the {@code div} whose id is {@code photo-detail-wrapper}.
 *
 * @param html the HTML markup to parse
 * @return the {@code src} attribute of the first {@code img} found in a matching div,
 *         or {@code null} when no matching div contains an image
 */
public static String parserImg(String html) {
    Document document = Jsoup.parse(html);
    for (Element div : document.select("div")) {
        if (!"photo-detail-wrapper".equals(div.attr("id"))) {
            continue;
        }
        Element image = div.select("img").first();
        // first() yields null for an empty selection; skip rather than throw.
        if (image != null) {
            return image.attr("src");
        }
    }
    return null;
}Example 3
| Project: sagan-master File: ReferenceDocumentSearchEntryMapper.java View source code |
/**
 * Builds a {@code ReferenceDoc} search entry from a parsed reference page.
 *
 * <p>The summary is the first 500 characters of the page text (or the whole text when
 * it is shorter). Project and version data come from the {@code project} and
 * {@code version} fields of this mapper.
 *
 * @param document the parsed page
 * @return the populated entry
 */
@Override
public ReferenceDoc map(Document document) {
    String content = document.text();
    String summary = content.length() > 500 ? content.substring(0, 500) : content;

    ReferenceDoc doc = new ReferenceDoc();
    doc.setRawContent(content);
    doc.setSummary(summary);
    doc.setTitle(document.title());
    doc.setSubTitle(String.format("%s (%s Reference)", project.getName(), version.getVersion()));
    doc.setPath(document.baseUri());
    doc.setCurrent(version.isCurrent());
    doc.setProjectId(project.getId());
    doc.setVersion(version.getVersion());

    String projectFacet = "Projects/" + project.getName();
    String versionFacet = projectFacet + "/" + version.getVersion();
    doc.addFacetPaths("Projects", "Projects/Reference", projectFacet, versionFacet);
    return doc;
}Example 4
| Project: Android-Studio-Project-master File: ContentParser.java View source code |
/**
 * Builds a {@code Content} from the second PNG/JPEG image found in the markup.
 *
 * @param html the HTML markup to parse
 * @return a content object carrying the image's {@code src} as URL and {@code alt} as title
 */
public static Content Parser(String html) {
    Elements images = Jsoup.parse(html).select("img[src~=(?i)\\.(png|jpe?g)]");
    Content result = new Content();
    // Index 1 picks the second match; fewer than two matches raises an index error.
    Element picked = images.get(1).getElementsByTag("img").first();
    result.setUrl(picked.attr("src"));
    result.setTitle(picked.attr("alt"));
    return result;
}Example 5
| Project: coolreader-master File: DownloadPageTask.java View source code |
/**
 * Downloads and parses the page at the first URL in {@code arg0}.
 *
 * @param arg0 the URLs passed to the task; only the first one is used
 * @return a result wrapping the parsed document, or wrapping the exception on any failure
 */
@Override
protected AsyncTaskResult<Document> doInBackground(URL... arg0) {
    try {
        String address = arg0[0].toString();
        Log.d("DownloadPageTask", "Downloading: " + address);
        Response reply = Jsoup.connect(address).timeout(7000).execute();
        Log.d("DownloadPageTask", "Complete: " + address);
        return new AsyncTaskResult<Document>(reply.parse());
    } catch (Exception e) {
        return new AsyncTaskResult<Document>(e);
    }
}
Example 6
| Project: jinjava-master File: GroupByFilterTest.java View source code |
/** Renders the group-by template for five persons and checks the resulting list structure. */
@Test
public void testGroupByAttr() throws Exception {
    String template = Resources.toString(Resources.getResource("filter/groupby-attr.jinja"), StandardCharsets.UTF_8);
    Object persons = Lists.newArrayList(
            new Person("male", "jared", "stehler"),
            new Person("male", "foo", "bar"),
            new Person("female", "sarah", "jones"),
            new Person("male", "jim", "jones"),
            new Person("female", "barb", "smith"));

    Document dom = Jsoup.parseBodyFragment(jinjava.render(template, ImmutableMap.of("persons", persons)));

    assertThat(dom.select("ul.root > li")).hasSize(2);
    assertThat(dom.select("ul.root > li.male > ul > li")).hasSize(3);
    assertThat(dom.select("ul.root > li.female > ul > li")).hasSize(2);
}Example 7
| Project: jooby-master File: Issue624d.java View source code |
/** Checks the login form's action on the saved URL, then that authenticating leads back to it. */
@Test
public void shouldForceARedirect() throws Exception {
    request().get("/saved-url").expect(rsp -> {
        String formAction = Jsoup.parse(rsp).select("form").attr("action");
        assertEquals("/auth?client_name=FormClient", formAction);
    });
    request().get("/auth?username=test&password=test").expect("/saved-url");
}Example 8
| Project: jsoup-master File: W3CDom.java View source code |
/**
 * Converts a jsoup document into a newly created W3C DOM document.
 *
 * @param in the jsoup document; must not be null
 * @return the W3C document filled by {@code convert}
 * @throws IllegalStateException if the document builder cannot be configured
 */
public Document fromJsoup(org.jsoup.nodes.Document in) {
    Validate.notNull(in);
    try {
        // Make the factory namespace-aware before asking it for a builder.
        factory.setNamespaceAware(true);
        Document w3cDoc = factory.newDocumentBuilder().newDocument();
        convert(in, w3cDoc);
        return w3cDoc;
    } catch (ParserConfigurationException e) {
        throw new IllegalStateException(e);
    }
}Example 9
| Project: LNReader-Android-master File: DownloadPageTask.java View source code |
/**
 * Downloads and parses the page at the first URL in {@code arg0}.
 *
 * @param arg0 the URLs passed to the task; only the first one is used
 * @return a result wrapping the parsed document, or a null payload plus the exception on failure
 */
@Override
protected AsyncTaskResult<Document> doInBackground(URL... arg0) {
    try {
        String address = arg0[0].toString();
        Log.d("DownloadPageTask", "Downloading: " + address);
        Response reply = Jsoup.connect(address).timeout(7000).execute();
        Log.d("DownloadPageTask", "Complete: " + address);
        return new AsyncTaskResult<Document>(reply.parse(), Document.class);
    } catch (Exception e) {
        return new AsyncTaskResult<Document>(null, Document.class, e);
    }
}
Example 10
| Project: moulder-j-master File: TexterTest.java View source code |
/** Checks that {@code Texter} replaces the element's text with the value it was given. */
@Test
public void testRegularText() throws Exception {
    Value<String> value = mock(Value.class);
    when(value.get()).thenReturn("text");
    Texter texter = new Texter(value);

    Document parsed = Jsoup.parseBodyFragment("<html><body><outer>test</outer></body></html>");
    Element outer = parsed.getElementsByTag("outer").first();
    List<Node> result = texter.process(outer);

    // Only get() is verified on the mocked value.
    InOrder order = inOrder(value);
    order.verify(value).get();
    assertXMLEqual(new StringReader("<body><outer>text</outer></body>"), new StringReader(html(result)));
}Example 11
| Project: muzima-android-master File: HTMLConceptParser.java View source code |
/**
 * Collects the distinct concept names referenced in the markup.
 *
 * @param html the HTML markup to parse
 * @return the distinct names, in no particular order
 */
public List<String> parse(String html) {
    // Every element except <div> that carries the DATA_CONCEPT_TAG attribute.
    String selector = "*:not(div)[" + DATA_CONCEPT_TAG + "]";
    Set<String> names = new HashSet<String>();
    for (Element tagged : Jsoup.parse(html).select(selector)) {
        names.add(getConceptName(tagged.attr(DATA_CONCEPT_TAG)));
    }
    return new ArrayList<String>(names);
}Example 12
| Project: NiceText-master File: NTImpl.java View source code |
/**
 * Fetches {@code url} and runs the document-based {@code extract} on the parsed page.
 *
 * @param url the address to download
 * @return the extraction result, or {@code null} when an I/O error occurs
 */
public String extract(String url) {
    try {
        Connection.Response reply = Jsoup.connect(url)
                .userAgent(Constants.USER_AGENT)
                .header("Accept", "text/html,application/xhtml+xml,application/xml")
                .header("Accept-Encoding", "gzip,deflate,sdch")
                .followRedirects(true)
                .timeout(Constants.CONN_TIMEOUT)
                .execute();
        return extract(reply.parse());
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}Example 13
| Project: pictorial_android_client-master File: ParserImageList.java View source code |
/**
 * Parses a listing page into an {@code ImageListBean}.
 *
 * <p>One {@code ImageBean} is created for every element of class {@code post-inner} that
 * contains an {@code img[src]}; posts without such an image are skipped.
 *
 * @param mRet the HTML markup, may be {@code null}
 * @return the list of images; empty when {@code mRet} is {@code null} or nothing matches
 */
public static ImageListBean parser(String mRet) {
    ImageListBean imageListBean = new ImageListBean();
    if (mRet == null) {
        return imageListBean;
    }
    for (Element post : Jsoup.parse(mRet).getElementsByClass("post-inner")) {
        Element image = post.select("img[src]").first();
        if (image == null) {
            // first() is null for an empty selection; previously this caused a NullPointerException.
            continue;
        }
        ImageBean imageBean = new ImageBean();
        imageBean.setAlt(image.attr("alt"));
        imageBean.setDetailurl(post.select("a[title]").attr("href"));
        imageBean.setHeight(image.attr("height"));
        imageBean.setWidth(image.attr("width"));
        imageBean.setImgurl(image.attr("src"));
        imageListBean.add(imageBean);
    }
    return imageListBean;
}Example 14
| Project: playconf-master File: IndexViewTest.java View source code |
/** Renders the index view for a sample proposal and checks the title and speaker name. */
@Override
public void run() {
    Context.current.set(testHttpContext());
    Proposal proposal = sampleProposal();
    Speaker presenter = sampleSpeaker();
    proposal.speaker = presenter;

    Html rendered = views.html.index.render(proposal);
    Document page = Jsoup.parse(contentAsString(rendered));

    assertThat(page.select("#title").text()).isEqualTo("Keynote - " + proposal.title);
    assertThat(page.select("#speakerName").text()).isEqualTo(presenter.name);
}Example 15
| Project: ulti-master File: UtilsDemo.java View source code |
/**
 * Parses a small fixed HTML snippet, logs a few document statistics, and describes
 * every element of the document.
 *
 * @return one line per element: tag name, node name, child count, data and text
 */
public static String TestJsoup() {
    String html1 = "<html><head><title>First parse</title></head>" + "<body><p>Parsed HTML into a doc.</p></body></html>";
    // The buffer never leaves this method, so the unsynchronized StringBuilder suffices.
    StringBuilder sb = new StringBuilder();
    Document doc = Jsoup.parse(html1);
    Logs.d("docs---" + doc.title() + " " + doc.getAllElements().size());
    Logs.d("docs---" + doc.children().size() + " " + doc.location());
    for (Element element : doc.getAllElements()) {
        sb.append(element.tagName()).append(" ")
                .append(element.nodeName()).append(" ")
                .append(element.children().size()).append(" ")
                .append(element.data()).append(" ")
                .append(element.text()).append("\n");
        Logs.d(element.text() + " ");
    }
    return sb.toString();
}Example 16
| Project: UltimateAndroid-master File: UtilsDemo.java View source code |
/**
 * Parses a small fixed HTML snippet, logs a few document statistics, and describes
 * every element of the document.
 *
 * @return one line per element: tag name, node name, child count, data and text
 */
public static String TestJsoup() {
    String html1 = "<html><head><title>First parse</title></head>" + "<body><p>Parsed HTML into a doc.</p></body></html>";
    // The buffer never leaves this method, so the unsynchronized StringBuilder suffices.
    StringBuilder sb = new StringBuilder();
    Document doc = Jsoup.parse(html1);
    Logs.d("docs---" + doc.title() + " " + doc.getAllElements().size());
    Logs.d("docs---" + doc.children().size() + " " + doc.location());
    for (Element element : doc.getAllElements()) {
        sb.append(element.tagName()).append(" ")
                .append(element.nodeName()).append(" ")
                .append(element.children().size()).append(" ")
                .append(element.data()).append(" ")
                .append(element.text()).append("\n");
        Logs.d(element.text() + " ");
    }
    return sb.toString();
}Example 17
| Project: v2ex-android-master File: NotificationListModel.java View source code |
/**
 * Parses a notifications page: adds every successfully parsed {@code cell} element to
 * this list and records the paging information.
 *
 * @param responseBody the HTML of the page
 * @throws Exception if parsing fails
 */
public void parse(String responseBody) throws Exception {
    Element pageBody = Jsoup.parse(responseBody).body();
    for (Element cell : pageBody.getElementsByAttributeValue("class", "cell")) {
        NotificationModel item = new NotificationModel();
        if (item.parse(cell)) {
            add(item);
        }
    }
    // parsePage result: index 0 is stored as the current page, index 1 as the total.
    int[] pageInfo = ContentUtils.parsePage(pageBody);
    currentPage = pageInfo[0];
    totalPage = pageInfo[1];
}Example 18
| Project: validadorAcessibilidade-master File: RecomendacaoTarget.java View source code |
/**
 * Collects the parts of the document that open new windows or run automatically.
 *
 * <p>The result lists every anchor with {@code target="_blank"}, then every anchor with
 * {@code target="_new"}, then the {@code onload} attribute of the body when it is not
 * empty. Each item is preceded by a newline.
 *
 * @param doc the document to inspect
 * @return the concatenated findings, or an empty string when nothing was found
 */
@Override
public String executa(Document doc) {
    StringBuilder elementoTarget = new StringBuilder();
    // The attribute selectors need their closing bracket to be well-formed.
    for (Element element : doc.select("a[target=_blank]")) {
        elementoTarget.append("\n").append(element);
    }
    for (Element element : doc.select("a[target=_new]")) {
        elementoTarget.append("\n").append(element);
    }
    Element execucaoAutomatica = doc.select("body").first();
    if (execucaoAutomatica != null) {
        String onload = execucaoAutomatica.attr("onload");
        if (!onload.isEmpty()) {
            elementoTarget.append("\n").append(onload);
        }
    }
    return elementoTarget.toString();
}Example 19
| Project: alfresco-apache-storm-demo-master File: JSoupDOMBuilder.java View source code |
/**
 * Creates a W3C DOM document and fills it from the supplied jsoup document via
 * {@code createDOM}.
 *
 * @param jsoupDocument the jsoup document to convert
 * @return the new W3C document
 * @throws RuntimeException if the XML document builder cannot be configured
 */
public static Document jsoup2DOM(org.jsoup.nodes.Document jsoupDocument) {
    try {
        // Builder for whichever XML parser the platform is configured with.
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document w3cDocument = builder.newDocument();
        createDOM(jsoupDocument, w3cDocument, w3cDocument, new HashMap<String, String>());
        return w3cDocument;
    } catch (ParserConfigurationException pce) {
        throw new RuntimeException(pce);
    }
}Example 20
| Project: Android_RssReader-master File: DescriptionFormatter.java View source code |
/**
 * Returns the blog description only when it can be shown as-is.
 *
 * <p>An empty string is returned when the blog or its description is missing or empty,
 * when the description contains an {@code embed} element, when an {@code iframe} or
 * anchor points at a video source, or when any image's {@code src} does not start with
 * {@code prefix}.
 *
 * @param blog the blog entry, may be {@code null}
 * @return the unchanged description, or an empty string
 */
@Override
protected String LoadFromCache(Blog blog) {
    // The original guard tested "blog != null &&", so a null blog fell through to an NPE.
    if (blog == null || blog.Description == null || blog.Description.length() == 0) {
        return "";
    }
    Document doc = Jsoup.parse(blog.Description);
    if (!doc.getElementsByTag("embed").isEmpty()) {
        return "";
    }
    for (Element frame : doc.getElementsByTag("iframe")) {
        if (frame.hasAttr("src") && isVideoLink(frame.attr("src"))) {
            return "";
        }
    }
    for (Element anchor : doc.getElementsByTag("a")) {
        if (anchor.hasAttr("href") && isVideoLink(anchor.attr("href"))) {
            return "";
        }
    }
    for (Element img : doc.getElementsByTag("img")) {
        if (img.hasAttr("src") && !img.attr("src").startsWith(prefix)) {
            return "";
        }
    }
    return blog.Description;
}

/** Tells whether the link contains one of the flash/video keywords this formatter rejects. */
private boolean isVideoLink(String link) {
    return link.contains("swf") || link.contains("youku") || link.contains("sohu")
            || link.contains("tudou") || link.contains("youtube") || link.contains("ku6");
}Example 21
| Project: any-video-master File: PandaCrawler.java View source code |
/**
 * Extracts live-stream entries from the page and stores them through
 * {@code redisSourceManager}.
 *
 * <p>Collection stops as soon as the list holds more than 48 entries.
 *
 * @param document the parsed listing page
 */
private void savePandaLivesToRedis(Document document) {
    List<VideoDTO> collected = new ArrayList<>();
    for (Element item : document.select("li.video-list-item.video-no-tag")) {
        String category = item.select("div.video-info span.video-cate").text();
        String nickname = item.select("div.video-info span.video-nickname").text();

        VideoDTO dto = new VideoDTO();
        dto.setAvailable(true);
        dto.setTitle("[" + category + "] " + nickname);
        dto.setImage(item.select("img.video-img").attr("data-original"));
        dto.setValue(PANDA + item.attr("data-id"));
        collected.add(dto);

        if (collected.size() > 48) {
            break;
        }
    }
    redisSourceManager.saveVideos(redisSourceManager.VIDEO_PREFIx_HOME_LIVE_KEY + "_" + TAG, collected);
}Example 22
| Project: cms-ce-master File: HtmlExtractor.java View source code |
/**
 * Extracts the text of an HTML stream by concatenating the direct text nodes of every element.
 *
 * @param mimeType the content type; checked with {@code canHandle}
 * @param inputStream the HTML input
 * @param encoding the character set name passed to jsoup
 * @return the collected text, or {@code null} when the MIME type is not handled
 * @throws IOException if the stream cannot be read
 */
@Override
public String extractText(final String mimeType, final InputStream inputStream, final String encoding) throws IOException {
    if (!canHandle(mimeType)) {
        return null;
    }
    final StringBuilder collected = new StringBuilder();
    for (Element node : Jsoup.parse(inputStream, encoding, "").getAllElements()) {
        for (TextNode piece : node.textNodes()) {
            final String content = piece.text();
            collected.append(content);
            appendWhitespaceAfterTextIfNotThere(collected, content);
        }
    }
    return collected.toString();
}Example 23
| Project: deepnighttwo-master File: FirstTry.java View source code |
/**
 * Posts a date-range query and prints the own text of every table cell in the result,
 * leaving out the first table row.
 *
 * @param args unused
 * @throws IOException if the request fails
 */
public static void main(String[] args) throws IOException {
    Document page = Jsoup.connect("http://www.envir.gov.cn/airnews/index.asp")
            .data("Fdate", "2000-6-1")
            .data("Tdate", "2000-6-8")
            .userAgent("I'm jsoup")
            .timeout(3000)
            .post();
    Elements tableRows = page.select("table[bordercolor] > tr");
    // Drop the first row before printing.
    tableRows.remove(0);
    for (Element tableRow : tableRows) {
        for (Element cell : tableRow.select("td")) {
            System.out.println(cell.ownText());
        }
    }
}Example 24
| Project: dungproxy-master File: WaitProxyTest.java View source code |
/**
 * Starts five threads that each fetch http://ip.cn/ five times and print the text of
 * {@code #result}, then does the same ten times on the calling thread.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // Enable the proxy IP pool and wait when no proxy is available.
    DungProxyContext proxyContext = DungProxyContext.create().setWaitIfNoAvailableProxy(true).setPoolEnabled(true);
    IpPoolHolder.init(proxyContext);
    for (int worker = 0; worker < 5; worker++) {
        new Thread() {
            @Override
            public void run() {
                for (int attempt = 0; attempt < 5; attempt++) {
                    String page = HttpInvoker.get("http://ip.cn/");
                    if (!StringUtils.isEmpty(page)) {
                        Document parsed = Jsoup.parse(page);
                        System.out.println(parsed.select("#result").text());
                    }
                }
            }
        }.start();
    }
    for (int attempt = 0; attempt < 10; attempt++) {
        String page = HttpInvoker.get("http://ip.cn/");
        if (!StringUtils.isEmpty(page)) {
            Document parsed = Jsoup.parse(page);
            System.out.println(parsed.select("#result").text());
        }
    }
}Example 25
| Project: EhViewer-master File: ProfileParser.java View source code |
/**
 * Parses a forum profile page into a {@code Result}.
 *
 * <p>The avatar is optional: any failure while reading it is logged and leaves the rest
 * of the result intact. A relative avatar path is prefixed with {@code EhUrl.URL_FORUMS}.
 *
 * @param body the HTML of the profile page
 * @return the parsed profile
 * @throws ParseException if the page cannot be parsed
 */
public static Result parse(String body) throws ParseException {
    try {
        Result parsed = new Result();
        Element nameBlock = Jsoup.parse(body).getElementById("profilename");
        parsed.displayName = nameBlock.child(0).text();
        try {
            Element avatarHolder = nameBlock.nextElementSibling().nextElementSibling();
            parsed.avatar = avatarHolder.child(0).attr("src");
            if (TextUtils.isEmpty(parsed.avatar)) {
                parsed.avatar = null;
            } else if (!parsed.avatar.startsWith("http")) {
                parsed.avatar = EhUrl.URL_FORUMS + parsed.avatar;
            }
        } catch (Exception e) {
            Log.i(TAG, "No avatar");
        }
        return parsed;
    } catch (Exception e) {
        throw new ParseException("Parse forums error", body);
    }
}Example 26
| Project: email-master File: UriParserTestHelper.java View source code |
/**
 * Asserts that the first anchor in {@code actual} has both its text and its
 * {@code href} equal to {@code expected}.
 *
 * @param expected the expected link text and target
 * @param actual the linkified markup to inspect
 */
public static void assertContainsLink(String expected, StringBuffer actual) {
    Element link = Jsoup.parseBodyFragment(actual.toString()).select("a").first();
    assertNotNull("No <a> element found", link);
    assertEquals(expected, link.text());
    assertEquals(expected, link.attr("href"));
}Example 27
| Project: example-webapp-master File: ExceptionHandlingIntegrationTests.java View source code |
/** Requests a failing URL, follows the redirect, and checks the error reference shown on the page. */
@Test
public void shouldSeeErrorReferenceDisplayedOnThePage() throws Exception {
    SpringDispatcherServlet dispatcher = SpringDispatcherServlet.create();

    MockHttpServletResponse firstReply = dispatcher.process(new MockHttpServletRequest("GET", "/bad"));
    String errorPageUrl = firstReply.getRedirectedUrl();
    assertThat(errorPageUrl, matchesPattern(sequence("/error/", exactly(7, anyCharacterIn("A-Z0-9")))));
    String expectedRef = StringUtils.substringAfterLast(errorPageUrl, "/");

    MockHttpServletResponse errorPage = dispatcher.process(new MockHttpServletRequest("GET", errorPageUrl));
    Elements refNodes = Jsoup.parse(errorPage.getContentAsString()).select("#errorRef");
    assertThat(refNodes.size(), equalTo(1));
    assertThat(refNodes.first().text(), equalTo(expectedRef));
}Example 28
| Project: GoVRE-master File: ProxyNetworkTrainMapImage.java View source code |
//METHODS
/**
 * Looks up the absolute URL of the train map image on the page configured in
 * {@code R.string.urlVREImgMap}.
 *
 * @param context used to resolve the page URL from resources
 * @return the absolute URL of the first {@code img} whose source contains
 *         {@code app?action=getimg}; an empty string when the page has no such image;
 *         {@code null} when the page could not be downloaded
 */
private static String fetchTrainImageUrlFromVRE(Context context) {
    try {
        String pageUrl = context.getResources().getString(R.string.urlVREImgMap);
        // Walk over every element that has a src attribute and keep only images.
        for (Element candidate : Jsoup.connect(pageUrl).get().select("[src]")) {
            if (!candidate.tagName().equals("img")) {
                continue;
            }
            String absolute = candidate.attr("abs:src");
            // Only the map image carries the action query string.
            if (absolute.contains("app?action=getimg")) {
                return absolute;
            }
        }
        return "";
    } catch (IOException e) {
        // The failure is swallowed; the method falls through and returns null.
    }
    return null;
}Example 29
| Project: IU-master File: ConsumeInfo.java View source code |
/**
 * Reads rows of the {@code GridView1} table into {@code list}.
 *
 * <p>The first and the last row are not read. Of the remaining rows, only those with
 * exactly three cells are used: cell 1 becomes {@code time}, cell 2 becomes {@code remain}.
 *
 * @param list receives the parsed entries; when {@code null}, a throwaway list is used
 * @param doc the parsed page, may be {@code null}
 * @return 0 when {@code doc} is {@code null} or the table has fewer than two rows,
 *         otherwise the number of anchors inside the table rows plus one
 */
public static int parseHtml(List<ConsumeInfo> list, Document doc) {
    if (doc == null) {
        return 0;
    }
    if (list == null) {
        list = new ArrayList<>();
    }
    Elements rows = doc.select("table#GridView1").select("tr");
    int rowCount = rows.size();
    if (rowCount < 2) {
        return 0;
    }
    for (int index = 1; index < rowCount - 1; index++) {
        Elements cells = rows.get(index).children();
        if (cells.size() == 3) {
            ConsumeInfo entry = new ConsumeInfo();
            entry.time = cells.get(1).text();
            entry.remain = cells.get(2).text();
            list.add(entry);
        }
    }
    return rows.select("a").size() + 1;
}Example 30
| Project: japicmp-master File: ITReportTitle.java View source code |
/** Checks the japicmp entry in the navigation and in the overview table of the generated site report. */
@Test
public void testReportTitle() throws IOException {
    Path reportIndex = Paths.get(System.getProperty("user.dir"), "target", "site", "project-reports.html");
    assertThat(Files.exists(reportIndex), is(true));
    Document page = Jsoup.parse(reportIndex.toFile(), "UTF-8");

    Elements navLink = page.select("#leftColumn [href=\"japicmp.html\"]");
    assertThat(navLink.attr("title"), is("japicmp"));
    assertThat(navLink.text(), is("japicmp"));

    Elements row = page.select("#bodyColumn tr:has([href=\"japicmp.html\"])");
    assertThat(row.select("[href=\"japicmp.html\"]").text(), is("japicmp"));

    Elements descriptionCell = row.select("td:eq(1)");
    String projectVersion = System.getProperty("project.version");
    String expectedDescription = "Comparing source compatibility of japicmp-test-v2-" + projectVersion + ".jar against japicmp-test-v1-" + projectVersion + ".jar";
    assertThat(descriptionCell.text(), is(expectedDescription));
}Example 31
| Project: JianShuApp-master File: DataPool.java View source code |
/**
 * Loads a page and extracts its items.
 *
 * <p>The session is re-validated when the response is not a string or when the page is
 * the login page; if the session then turns out to be logged out, the call fails.
 *
 * @param url the page address
 * @return the items of the page, or {@code null} when the response was not a string
 * @throws IOException declared for I/O failures of the request
 * @throws LoginRequiredException if the session is logged out after re-validation
 */
private Object[] load(String url) throws IOException, LoginRequiredException {
    Object httpResult = JianshuSession.getsInstance().getSync(url, true);
    boolean isPage = httpResult instanceof String;
    Document doc = isPage ? Jsoup.parse((String) httpResult) : null;

    if (!isPage || doc.select("div.login-page").size() > 0) {
        JianshuSession.getsInstance().validate();
        if (JianshuSession.getsInstance().getState() instanceof JianshuSession.LogoutState) {
            throw new LoginRequiredException();
        }
    }
    if (!isPage) {
        return null;
    }
    parsePageUserInfo(doc);
    return this.getItems(doc);
}Example 32
| Project: jphp-master File: JsoupExtension.java View source code |
/**
 * Registers the jsoup wrapper classes and the URL memory operation in the given scope.
 *
 * @param scope the compile scope that receives the registrations
 */
@Override
public void onRegister(CompileScope scope) {
// Entry-point wrapper class.
registerClass(scope, WrapJsoup.class);
// Map each jsoup type to the wrapper class registered for it.
registerWrapperClass(scope, Connection.class, WrapConnection.class);
registerWrapperClass(scope, Connection.Response.class, WrapConnectionResponse.class);
registerWrapperClass(scope, Connection.Request.class, WrapConnectionRequest.class);
registerWrapperClass(scope, Document.class, WrapDocument.class);
registerWrapperClass(scope, Element.class, WrapElement.class);
registerWrapperClass(scope, Elements.class, WrapElements.class);
MemoryOperation.register(new UrlMemoryOperation());
// The binary memory operation is currently disabled.
//MemoryOperation.register(new BinaryMemoryOperation());
}Example 33
| Project: k-9-master File: UriParserTestHelper.java View source code |
/**
 * Asserts that the first anchor in {@code actual} has both its text and its
 * {@code href} equal to {@code expected}.
 *
 * @param expected the expected link text and target
 * @param actual the linkified markup to inspect
 */
public static void assertContainsLink(String expected, StringBuffer actual) {
    Element link = Jsoup.parseBodyFragment(actual.toString()).select("a").first();
    assertNotNull("No <a> element found", link);
    assertEquals(expected, link.text());
    assertEquals(expected, link.attr("href"));
}Example 34
| Project: KinoCast-master File: NowVideo.java View source code |
/**
 * Resolves the MP4 source of the video identified by the last path segment of {@code url}.
 *
 * @param queryTask receives a progress message and supplies the context for it
 * @return the {@code src} of the {@code source[type=video/mp4]} element, or {@code null}
 *         when {@code url} is empty or any exception occurs
 */
@Override
public String getVideoPath(DetailActivity.QueryPlayTask queryTask) {
    if (TextUtils.isEmpty(url)) {
        return null;
    }
    try {
        String videoId = url.substring(url.lastIndexOf("/") + 1);
        queryTask.updateProgress(queryTask.getContext().getString(R.string.host_progress_getvideoforid, videoId));
        Document mobilePage = Jsoup.connect("http://www.nowvideo.sx/mobile/video.php?id=" + videoId)
                .userAgent(Utils.USER_AGENT)
                .timeout(3000)
                .get();
        return mobilePage.select("source[type=video/mp4]").attr("src");
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}Example 35
| Project: learn_crawler-master File: HtmlParserTool.java View source code |
/**
 * Downloads a page and collects the absolute URLs of its anchors, frames and iframes
 * that the filter accepts. Every anchor URL is also printed to standard output.
 *
 * @param url the page to download
 * @param filter decides which URLs are kept
 * @return the accepted URLs; empty when the download fails
 */
public static Set<String> extracLinks(String url, LinkFilter filter) {
    Set<String> accepted = new HashSet<String>();
    try {
        Document page = Jsoup.connect(url).timeout(5000).get();
        Elements anchors = page.select("a[href]");
        Elements frames = page.select("frame[src]");
        Elements iframes = page.select("iframe[src]");
        for (Element anchor : anchors) {
            String target = anchor.absUrl("href");
            System.out.println(target);
            if (filter.accept(target)) {
                accepted.add(target);
            }
        }
        for (Element frame : frames) {
            String source = frame.absUrl("src");
            if (filter.accept(source)) {
                accepted.add(source);
            }
        }
        for (Element iframe : iframes) {
            String source = iframe.absUrl("src");
            if (filter.accept(source)) {
                accepted.add(source);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return accepted;
}Example 36
| Project: like_googleplus_layout-master File: PhoneKRNewsContentUtils.java View source code |
/**
 * Downloads a news page and extracts the content of the paragraphs inside the element
 * with id {@code xs-post}.
 *
 * <p>A paragraph without links contributes its text prefixed with
 * {@code FOUR_BLANK_SPACE}; a paragraph whose first link contains an image contributes
 * that image's {@code src}; other paragraphs are skipped.
 *
 * @param newsUrl the address of the news page
 * @return the extracted entries, or {@code null} when the page has no {@code xs-post}
 *         element, that element has no paragraphs, or the download fails
 */
public static LinkedList<String> getPhoneKRNewsDataList(String newsUrl) {
    LinkedList<String> data = null;
    try {
        Element post = Jsoup.connect(newsUrl).get().getElementById("xs-post");
        if (post == null) {
            // getElementById returns null when the id is absent; avoid the NullPointerException.
            return null;
        }
        Elements paragraphs = post.getElementsByTag("p");
        if (!paragraphs.isEmpty()) {
            data = new LinkedList<String>();
            for (Element paragraph : paragraphs) {
                String text = null;
                Elements anchors = paragraph.getElementsByTag("a");
                if (anchors.isEmpty()) {
                    text = FOUR_BLANK_SPACE + paragraph.text();
                } else {
                    Elements images = anchors.get(0).getElementsByTag("img");
                    if (!images.isEmpty()) {
                        text = images.get(0).attr("src");
                    }
                }
                if (!TextUtils.isEmpty(text)) {
                    data.add(text);
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return data;
}Example 37
| Project: mechanize-master File: JsoupUtilTest.java View source code |
/** Exercises {@code findFirstByTag} with single tags and slash-separated tag paths. */
@Test
public void testFindFirstByTagSingleTag() {
    Document page = Jsoup.parse("<html><body><a href=\"A\">A</a><a href=\"B\">B</a></body></html>");

    // Lookups expected to find an element.
    assertNotNull(JsoupDataUtil.findFirstByTag(page, "a"));
    assertNotNull(JsoupDataUtil.findFirstByTag(page, "body"));
    assertNotNull(JsoupDataUtil.findFirstByTag(page, "body/a"));
    assertNotNull(JsoupDataUtil.findFirstByTag(page, "html/body/a"));
    assertNotNull(JsoupDataUtil.findFirstByTag(page, "html/a"));

    // Lookups expected to find nothing.
    assertNull(JsoupDataUtil.findFirstByTag(page, "body/html/a"));
    assertNull(JsoupDataUtil.findFirstByTag(page, "body/unknown"));
}Example 38
| Project: mlcomp-master File: TitleMap.java View source code |
/**
 * Emits one output record holding the URL and the texts of all {@code title} elements
 * of the page. Each title text is preceded by a comma, so a non-empty result starts
 * with a comma.
 *
 * @param recordNum the record number (unused)
 * @param record the input record: column 0 is the URL, column 1 the HTML
 * @param context used to create and write the output record
 * @throws IOException if writing the record fails
 */
@Override
public void map(long recordNum, Record record, TaskContext context) throws IOException {
    String url = (String) record.get(0);
    String html = (String) record.get(1);

    StringBuilder titles = new StringBuilder();
    for (Element titleElement : Jsoup.parse(html).getElementsByTag("title")) {
        titles.append(",").append(titleElement.text());
    }

    Record output = context.createOutputRecord();
    output.set("url", url);
    output.set("title", titles.toString());
    context.write(output);
}Example 39
| Project: mobile-ycjw-master File: StudentDevelopmentScheduleQuery.java View source code |
/**
 * Downloads the development schedule page and returns the markup of the
 * {@code #DG_GetGrjh} element.
 *
 * <p>The response is read line by line and the lines are joined without separators
 * before parsing. The response stream is closed once it has been read.
 *
 * @param context the Android context; its application context must be a {@code YCApplication}
 * @return the string form of the selected elements
 * @throws Exception wrapping any failure that occurs
 */
@Override
public String getDevelopmentScheduleQueryInfo(Context context) throws Exception {
    try {
        YCApplication app = (YCApplication) context.getApplicationContext();
        String url = (String) app.get("selectedIp") + Constant.developScheduleQuery;
        HttpGet request = new HttpGet(url);
        HttpResponse response = app.getClient().execute(request);
        InputStream is = response.getEntity().getContent();
        StringBuilder sb = new StringBuilder();
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(is, Constant.ENCODING));
            String temp;
            while ((temp = br.readLine()) != null) {
                sb.append(temp);
            }
        } finally {
            // Previously the stream was never closed, leaking the underlying connection.
            is.close();
        }
        Document doc = Jsoup.parse(sb.toString());
        Elements table = doc.select("#DG_GetGrjh");
        return table.toString();
    } catch (Exception e) {
        throw new Exception(e);
    }
}Example 40
| Project: Muzik-master File: SearchDownloadsNL.java View source code |
/**
 * Searches downloads.nl and resolves every hit to its redirect target.
 *
 * <p>For each {@code .tl} element of the result page, the link is requested without
 * following redirects and the {@code Location} header becomes the song URL. Hits
 * without a {@code Location} header are skipped. An I/O error stops the search and
 * returns whatever was collected so far.
 *
 * @param query the search text appended to the base query URL
 * @return the resolved songs, possibly empty
 */
public static ArrayList<SongResult> getSongs(String query) {
    ArrayList<SongResult> temp = new ArrayList<SongResult>();
    //base query url.
    // NOTE(review): the query is appended unescaped; confirm whether it should be URL-encoded.
    String u = "http://www.downloads.nl/results/mp3/1/" + Uri.parse(query);
    try {
        Document document = Jsoup.connect(u).get();
        for (Element x : document.select(".tl")) {
            String url = "http://www.downloads.nl" + x.attr("href");
            //todo add artist string to the name so that result is clearer
            HttpURLConnection ucon = (HttpURLConnection) new URL(url).openConnection();
            String location;
            try {
                ucon.setInstanceFollowRedirects(false);
                location = ucon.getHeaderField("Location");
            } finally {
                // Release the connection; it was previously left open for every hit.
                ucon.disconnect();
            }
            if (location == null) {
                // No redirect target: skip this hit instead of aborting the whole search.
                continue;
            }
            URL secondURL = new URL(location);
            String name = x.select("span").text();
            if (HomescreenActivity.debugMode) {
                Log.d("Play", "Downloads.nl Name=" + name + " url=" + secondURL);
            }
            temp.add(new SongResult(name, secondURL.toString()));
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return temp;
}Example 41
| Project: opacclient-master File: ZonesTest.java View source code |
/** Checks that a next-page URL is found on the first result page and none on the second. */
@Test
public void testAccountPages() {
    Document page1 = Jsoup.parse(readResource("/zones/medialist/koeln_pages_1.html"));
    Document page2 = Jsoup.parse(readResource("/zones/medialist/koeln_pages_2.html"));
    page1.setBaseUri(BASE_URL);
    page2.setBaseUri(BASE_URL);
    String nextPage1 = Zones.findNextPageUrl(page1);
    assertNotNull(nextPage1);
    // Expected value first, so that a failure message labels the two values correctly.
    assertEquals("https://katalog.stbib-koeln.de/alswww2" + ".dll/Obj_4051458325195?Style=Portal3&SubStyle=&Lang=GER&ResponseEncoding" + "=utf-8&Method=PageDown&PageSize=10", nextPage1);
    String nextPage2 = Zones.findNextPageUrl(page2);
    assertNull(nextPage2);
}Example 42
| Project: orcid-update-java-master File: DelegatingMetaScraper.java View source code |
/**
 * Fetches the metadata for {@code url}.
 *
 * <p>Identifiers starting with {@code uk.bl.ethos} are delegated to an
 * {@code EthosMetaScraper}. Otherwise a cached builder is used when present, and the
 * page is downloaded when it is not.
 *
 * @param url the page address or EThOS identifier
 * @return the Dublin Core metadata of the page
 * @throws IOException if the download fails
 */
@Override
public IsOrcidWork fetch(String url) throws IOException {
    // An EThOS identifier is handled by its dedicated scraper.
    if (url.startsWith("uk.bl.ethos")) {
        return new EthosMetaScraper().fetch(url);
    }
    HTMLMetaBuilder cached = cache.getIfPresent(url);
    if (cached != null) {
        return cached.getDublinCoreMeta();
    }
    System.out.println("looking up " + url);
    // NOTE(review): the new builder is not stored in the cache here; confirm whether that is intended.
    HTMLMetaBuilder fresh = new HTMLMetaBuilder(Jsoup.connect(url).timeout(10000).get());
    return fresh.getDublinCoreMeta();
}Example 43
| Project: pack-master File: CrawlerPack.java View source code |
/**
 * Converts an XML string into a jsoup Document using jsoup's XML parser.
 *
 * <p>Jsoup 1.9.1+ supports non-ASCII tag names. An earlier workaround, no longer
 * active, handled tag names whose first character is not a-zA-Z (which jsoup parsed
 * as comments) by inserting a prefix before parsing and removing it again afterwards.
 *
 * @param xml XML format string
 * @return org.jsoup.nodes.Document with its charset set to UTF-8
 */
public org.jsoup.nodes.Document xmlToJsoupDoc(String xml) {
    // Parse with the XML parser and an empty base URI; no tag prefixing is applied.
    Document parsed = Jsoup.parse(xml, "", Parser.xmlParser());
    parsed.charset(StandardCharsets.UTF_8);
    return parsed;
}Example 44
| Project: sample-skeleton-projects-master File: MainRunner.java View source code |
/**
 * Downloads the page at {@code URL}, prints its title, and prints a favicon path.
 *
 * <p>When the {@code hrefLink} selector matches nothing in the head, the
 * {@code content} attribute of the first {@code imgMeta} match is used, if there is one.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String faviconImagePath = "";
    Connection conn = Jsoup.connect(URL).timeout(LONG_TIMEOUT);
    try {
        Document documentObject = conn.get();
        System.out.println("URL title: " + documentObject.title());
        Element domElement = documentObject.head().select(hrefLink).first();
        // NOTE(review): when hrefLink matches, the path stays empty; confirm whether
        // an attribute of that element should be used instead.
        if (domElement == null) {
            domElement = documentObject.head().select(imgMeta).first();
            if (domElement != null) {
                // Guard added: first() is null for an empty selection.
                faviconImagePath = domElement.attr("content");
            }
        }
        System.out.println("Favicon img: " + faviconImagePath);
    } catch (IOException e) {
        e.printStackTrace();
    }
}Example 45
| Project: seldon-server-master File: UrlSectionDynamicExtractor.java View source code |
/**
 * Returns one slash-separated section of the URL.
 *
 * <p>The {@code http://} prefix is removed before splitting. The section index is the
 * first extractor argument, parsed as an integer.
 *
 * @param attributeDetail supplies the extractor arguments
 * @param url the article URL
 * @param articleDoc the parsed article (unused)
 * @return the requested section, or {@code null} when there are no extractor arguments
 *         or the URL has no more than {@code index + 1} sections
 * @throws Exception if the index argument is not a valid integer
 */
@Override
public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception {
    String[] sections = url.replace("http://", "").split("/");
    if (attributeDetail.extractor_args.isEmpty()) {
        return null;
    }
    int wanted = Integer.parseInt(attributeDetail.extractor_args.get(0));
    return sections.length > wanted + 1 ? sections[wanted] : null;
}Example 46
| Project: selfoss-android-master File: ArticleContentParser.java View source code |
/**
 * Lists the non-empty {@code src} values of all images in the article content.
 *
 * @return the image sources in document order
 */
public List<String> getImagesUrls() {
    List<String> sources = new ArrayList<String>();
    for (Element image : Jsoup.parse(article.getContent()).getElementsByTag("img")) {
        String src = image.attr("src");
        if (src == null || src.isEmpty()) {
            continue;
        }
        sources.add(src);
    }
    return sources;
}Example 47
| Project: SocialConnect-master File: JsoupBaseCrwaler.java View source code |
/**
 * Downloads the page at {@code url}, making up to four attempts.
 *
 * @param url the address to download
 * @return the parsed page from the first successful attempt
 * @throws IOException if every attempt fails; the last failure is attached as the cause
 */
@Override
public Document crwal(String url) throws IOException {
    if (logger.isDebugEnabled()) {
        logger.debug("Start crawling data from: " + url);
    }
    final int maxTriesToGetRemoteData = 4;
    Exception lastFailure = null;
    for (int tries = 0; tries < maxTriesToGetRemoteData; tries++) {
        try {
            return Jsoup.connect(url).timeout(5000).get();
        } catch (IOException e) {
            lastFailure = e;
            if (logger.isWarnEnabled()) {
                logger.warn("Got a " + e.getMessage() + " Exception, try again to fetch data from remote address. Number of previous tries: " + tries + ". At request: " + url);
            }
        }
    }
    throw new IOException("After " + maxTriesToGetRemoteData + " runs, gave up on fatching data from remote url: " + url, lastFailure);
}Example 48
| Project: stanbol-master File: DOMBuilder.java View source code |
/**
 * Creates a W3C DOM document and fills it from the supplied jsoup document via
 * {@code createDOM}.
 *
 * @param jsoupDocument the jsoup document to convert
 * @return the new W3C document
 * @throws RuntimeException if the XML document builder cannot be configured
 */
public static Document jsoup2DOM(org.jsoup.nodes.Document jsoupDocument) {
    try {
        // Builder for whichever XML parser the platform is configured with.
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document w3cDocument = builder.newDocument();
        createDOM(jsoupDocument, w3cDocument, w3cDocument, new HashMap<String, String>());
        return w3cDocument;
    } catch (ParserConfigurationException pce) {
        throw new RuntimeException(pce);
    }
}
Example 49
| Project: stocks-master File: YahooSearchProviderTest.java View source code |
/**
 * Parses the bundled {@code response_yahoo_search.txt} resource and checks the
 * number of extracted items as well as every field of the first item.
 */
@Test
public void testParsingHtml() throws IOException {
    String html;
    try (Scanner scanner = new Scanner(getClass().getResourceAsStream("response_yahoo_search.txt"), "UTF-8")) {
        // "\\A" only matches at the beginning of the input, so next() returns the whole resource.
        html = scanner.useDelimiter("\\A").next();
    }
    List<ResultItem> items = new YahooSearchProvider().extractFrom(Jsoup.parse(html));
    assertThat(items.size(), equalTo(20));
    ResultItem first = items.get(0);
    assertThat(first.getSymbol(), equalTo("D979C.LS"));
    assertThat(first.getName(), equalTo("BASF AG/CITI WT 14"));
    assertThat(first.getIsin(), equalTo("DE000CF79JW9"));
    assertThat(first.getLastTrade(), equalTo(Values.Quote.factorize(0.11)));
    assertThat(first.getType(), equalTo("Zertifikate & OS"));
    assertThat(first.getExchange(), equalTo("LIS"));
}
Example 50
| Project: TACIT-master File: SupremCrawlerFilter.java View source code |
/**
 * Lists the filter values of a crawl segment: the literal {@code "All"} followed by
 * the trimmed {@code href} of every link inside the first {@code .exmenu} element.
 *
 * @param segment path segment appended to the crawler URL
 * @return the filter values, starting with {@code "All"}
 * @throws IOException declared by this method; presumably raised while loading the
 *         page — confirm against {@code parseContentFromUrl}
 */
public List<String> filters(String segment) throws IOException {
    String address = URI.create(this.crawlerUrl + "/" + segment).toString();
    // get(0) fails with an IndexOutOfBoundsException when the page has no ".exmenu" element.
    Elements links = parseContentFromUrl(address).select(".exmenu").get(0).select("a");
    List<String> filterContents = new ArrayList<String>();
    filterContents.add("All");
    for (Element link : links) {
        filterContents.add(link.attr("href").trim());
    }
    return filterContents;
}
Example 51
| Project: TopNews-master File: NewsDetailsService.java View source code |
/**
 * Builds an HTML fragment for a news article: a header made of the title and
 * date, followed by the article element downloaded from {@code url}.
 *
 * <p>The closing {@code </body>} tag is always appended, including when the
 * download fails, so the returned markup is never left unclosed.
 *
 * @param url        page to download (9000 ms timeout)
 * @param news_title title inserted verbatim into the header (not escaped)
 * @param news_date  date inserted verbatim into the header (not escaped)
 * @return the assembled fragment; only the header when the download fails or the
 *         expected element is missing
 */
public static String getNewsDetails(String url, String news_title, String news_date) {
    String data = "<body>" + "<center><h2 style='font-size:16px;'>" + news_title + "</h2></center>";
    data = data + "<p align='left' style='margin-left:10px'>" + "<span style='font-size:10px;'>" + news_date + "</span>" + "</p>";
    data = data + "<hr size='1' />";
    try {
        Document document = Jsoup.connect(url).timeout(9000).get();
        Element element;
        // NOTE(review): this check runs after the connection was already made with the same
        // url, so the empty-url branch looks unreachable in practice — confirm the intent.
        if (TextUtils.isEmpty(url)) {
            data = "";
            element = document.getElementById("memberArea");
        } else {
            element = document.getElementById("artibody");
        }
        if (element != null) {
            data = data + element.toString();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    // Close the body on every path; previously a failed download returned unclosed markup.
    return data + "</body>";
}
Example 52
| Project: tori-master File: DOMBuilder.java View source code |
/**
 * Converts a Jsoup document into a W3C DOM document exposing the same content.
 *
 * @param jsoupDocument
 *            The Jsoup document to convert.
 * @return A W3C Document.
 */
public static Document jsoup2DOM(final org.jsoup.nodes.Document jsoupDocument) {
    try {
        /* Obtain the document builder for the configured XML parser. */
        final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        final DocumentBuilder builder = factory.newDocumentBuilder();
        /* Create a document to contain the content. */
        final Document target = builder.newDocument();
        createDOM(jsoupDocument, target, target, new HashMap<String, String>());
        return target;
    } catch (ParserConfigurationException pce) {
        throw new RuntimeException(pce);
    }
}
Example 53
| Project: voj-master File: HtmlTextFilter.java View source code |
/**
 * Filters a string that contains HTML.
 *
 * <p>A literal backslash-n marker is appended to each {@code <br>} and two are
 * prepended to each {@code <p>}; the markers are then replaced by real newlines
 * and the result is cleaned with {@code Whitelist.none()}.
 *
 * @param text the string to filter
 * @return the filtered string, or {@code null} when {@code text} is {@code null}
 */
public static String filter(String text) {
    if (text == null) {
        return null;
    }
    Document parsed = Jsoup.parse(text);
    parsed.outputSettings(new Document.OutputSettings().prettyPrint(false));
    parsed.select("br").append("\\n");
    parsed.select("p").prepend("\\n\\n");
    String withLineBreaks = parsed.html().replaceAll("\\\\n", "\n");
    Document.OutputSettings cleanSettings = new Document.OutputSettings().prettyPrint(false);
    return Jsoup.clean(withLineBreaks, "", Whitelist.none(), cleanSettings);
}
Example 54
| Project: WaveTact-master File: Quote.java View source code |
/**
 * Fetches a random quote page and sends the first quote together with its
 * author to the requester.
 *
 * <p>When the author text contains a dash it is shortened: to the part before
 * the first {@code "("} if there is one, otherwise to the part before the first
 * dash.
 *
 * @throws Exception when the page cannot be fetched or has no {@code .quote} /
 *         {@code .author} element
 */
@Override
public void onCommand(String command, User user, PircBotX network, String prefix, Channel channel, boolean isPrivate, int userPermLevel, String... args) throws Exception {
    // Host corrected from the mistyped "wwww." prefix.
    Document doc = Jsoup.connect("http://www.quotationspage.com/random.php3").userAgent(Registry.USER_AGENT).get();
    String quote = doc.select(".quote").get(0).text();
    String author = doc.select(".author").get(0).text();
    if (author.contains("-")) {
        if (!author.contains("(")) {
            author = author.split("-")[0];
        } else {
            author = author.split("\\(")[0];
        }
    }
    IRCUtils.sendMessage(user, network, channel, quote + " -" + IRCUtils.noPing(author), prefix);
}
Example 55
| Project: storm-crawler-master File: JSoupDOMBuilder.java View source code |
/**
 * Produces a W3C DOM document with the same content as the given Jsoup document.
 *
 * @param jsoupDocument
 *            The Jsoup document to convert.
 * @return A W3C Document.
 */
public static Document jsoup2DOM(org.jsoup.nodes.Document jsoupDocument) {
    final Document converted;
    try {
        /* Obtain the document builder for the configured XML parser and
         * create a document to contain the content. */
        converted = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
        createDOM(jsoupDocument, converted, converted, new HashMap<String, String>());
    } catch (ParserConfigurationException pce) {
        throw new RuntimeException(pce);
    }
    return converted;
}
Example 56
| Project: web-crawler-master File: JSoupDOMBuilder.java View source code |
/**
 * Copies the content of a Jsoup document into a freshly created W3C DOM document.
 *
 * @param jsoupDocument
 *            The Jsoup document to convert.
 * @return A W3C Document.
 */
public static Document jsoup2DOM(org.jsoup.nodes.Document jsoupDocument) {
    try {
        /* Obtain the document builder for the configured XML parser. */
        DocumentBuilder xmlBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        /* Create a document to contain the content. */
        Document result = xmlBuilder.newDocument();
        createDOM(jsoupDocument, result, result, new HashMap<String, String>());
        return result;
    } catch (ParserConfigurationException pce) {
        throw new RuntimeException(pce);
    }
}
Example 57
| Project: SOCIETIES-Platform-master File: Status.java View source code |
/**
 * Builds a {@code Status} from a JSON string whose {@code html} member holds
 * XML-escaped tweet markup.
 *
 * @param json the JSON text; must not be {@code null}
 * @return the populated status, or {@code null} when the JSON has no {@code html} member
 */
public static Status fromJson(String json) {
    Preconditions.checkNotNull(json);
    JsonObject root = (JsonObject) parser.parse(json);
    if (root.get("html") == null) {
        return null;
    }
    Status status = new Status();
    String markup = StringEscapeUtils.unescapeXml(root.get("html").getAsString());
    // use some jsoup magic to parse html and fetch require elements
    org.jsoup.nodes.Document document = Jsoup.parse(markup);
    // The creation date is the text of the last timestamp link, the body the first tweet paragraph.
    status.setCreatedAt(document.select("a[class*=tweet-timestamp]").last().text());
    status.setText(document.select("p[class*=js-tweet-text]").first().text());
    String rawId = parseUrlGetLastElementInPath(root.get("url").getAsString());
    status.setId(Long.parseLong(rawId));
    status.setScreenName(parseUrlGetLastElementInPath(root.get("author_url").getAsString()));
    // TODO: We need to parse out the other fields.
    status.jsonObject = root;
    status.jsonString = json;
    return status;
}
Example 58
| Project: AcFun-Area63-master File: DocumentRequest.java View source code |
/**
 * Decodes the response body and parses it into a {@code Document}.
 *
 * <p>The body is decoded with the charset from the response headers; when that
 * charset is not supported the platform default is used. Any failure while
 * decoding or parsing, on either path, is reported as a {@code ParseError}.
 *
 * @param response the raw network response
 * @return a success response holding the parsed document, or an error response
 */
@Override
protected Response<Document> parseNetworkResponse(NetworkResponse response) {
    try {
        String html;
        try {
            html = new String(response.data, HttpHeaderParser.parseCharset(response.headers));
        } catch (UnsupportedEncodingException e) {
            // Unsupported charset name in the headers: fall back to the platform default.
            html = new String(response.data);
        }
        // Parsing happens inside the outer try so that a failure on the fallback
        // path is converted to Response.error as well instead of escaping.
        return Response.success(parse(html), HttpHeaderParser.parseCacheHeaders(response));
    } catch (Exception e) {
        return Response.error(new ParseError(e));
    }
}
Example 59
| Project: ache-master File: GoogleSearch.java View source code |
/**
 * Runs a Google search and returns the result links found on one result page.
 *
 * @param query search terms, inserted into the URL as given (no URL-encoding is
 *              applied here)
 * @param page  zero-based result page; the start offset is {@code page * docsPerPage}
 * @return one entry (URL and link text) per {@code a[href]} inside the {@code .r}
 *         elements of {@code div#search}
 * @throws IOException when fetching or parsing fails; the original failure is the cause
 */
public List<BackLinkNeighborhood> submitQuery(String query, int page) throws IOException {
    timer.waitMinimumDelayIfNecesary();
    // 21 -> max number allowed by google... decreases after
    String queryUrl = "https://www.google.com/search?q=" + query + "&num=" + docsPerPage + "&start=" + page * docsPerPage;
    System.out.println("URL:" + queryUrl);
    try {
        FetchedResult result = fetcher.get(queryUrl);
        Document doc;
        // try-with-resources closes the stream even when parsing throws.
        try (InputStream is = new ByteArrayInputStream(result.getContent())) {
            doc = Jsoup.parse(is, "UTF-8", query);
        }
        Elements searchItems = doc.select("div#search");
        Elements linkHeaders = searchItems.select(".r");
        Elements linksUrl = linkHeaders.select("a[href]");
        List<BackLinkNeighborhood> links = new ArrayList<>();
        for (Element link : linksUrl) {
            String title = link.text();
            String url = link.attr("href");
            links.add(new BackLinkNeighborhood(url, title));
        }
        System.out.println(getClass().getSimpleName() + " hits: " + links.size());
        return links;
    } catch (IOException | BaseFetchException e) {
        throw new IOException("Failed to download backlinks from Google.", e);
    }
}
Example 60
| Project: asoiaf-master File: FetchUrls.java View source code |
/**
 * Downloads a page (5000 ms timeout) and reads the thumbnail and original image
 * links from its {@code li.outlink a} anchors: the anchor labelled {@code "200"}
 * provides the thumbnail, the one labelled {@code "original"} the original.
 *
 * @param url page to download
 * @return the collected URLs; fields stay unset when the download fails or no
 *         matching anchor exists
 */
public static ImageUrl FetchImageUrl(String url) {
    ImageUrl iu = new ImageUrl();
    try {
        Elements outlinks = Jsoup.connect(url).timeout(5000).get().select("li.outlink a");
        for (Element outlink : outlinks) {
            String label = outlink.text();
            if (label.equals("200")) {
                iu.setThumbUrl(outlink.select("a[href]").attr("href"));
            } else if (label.equals("original")) {
                iu.setOringinUrl(outlink.select("a[href]").attr("href"));
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return iu;
}
Example 61
| Project: asta4d-master File: ElementNotFoundHandlerOnDocumentTest.java View source code |
/**
 * Registers a not-found handler for {@code div} on a document that contains no
 * {@code div} and checks that the handler's alternative renderer changed the
 * {@code span} text to {@code "y"}.
 */
@Test
public void notFoundOnDocument() throws Exception {
    Document doc = Jsoup.parse("<html><body><span>x</span></body></html>");
    ElementNotFoundHandler divNotFound = new ElementNotFoundHandler("div") {
        @Override
        public Renderer alternativeRenderer() {
            return Renderer.create("span", "y");
        }
    };
    Renderer renderer = Renderer.create();
    renderer.add(divNotFound);
    RenderUtil.apply(doc, renderer);
    Assert.assertEquals(doc.select("span").text(), "y");
}
Example 62
| Project: baleen-master File: Jsp101HeadingsTest.java View source code |
/**
 * Runs the manipulator over four paragraphs and expects exactly one
 * {@code h1}, carrying the text of the bold all-caps paragraph.
 */
@Test
public void testSubjectHeading() {
    String fragment = "<p><b>THIS IS A SUBJECT HEADING</b></p><p>THIS IS A NOT SUBJECT HEADING</p><p>THIS IS not a SUBJECT HEADING</p><p>THIS IS NOT A SUBJECT HEADING EITHER.</p>";
    Document document = Jsoup.parseBodyFragment(fragment);
    manipulator.manipulate(document);
    Elements headings = document.select("h1");
    assertEquals(1, headings.size());
    assertEquals("THIS IS A SUBJECT HEADING", headings.first().text());
}
Example 63
| Project: bank-importer-master File: ItauPoupancaImportador.java View source code |
/**
 * Loads the savings-account statement and converts each row into a record.
 *
 * <p>Rows are the {@code div.rowPar} / {@code div.rowImpar} elements of the
 * statement container; for each row the second, third and fourth {@code td}
 * hold date, description and value. Rows whose description is listed in
 * {@code descricoesIgnorar} are skipped.
 *
 * @return the records of the statement, in page order
 */
@Override
public List<BancoRegistro> carregarLancamentosExtrato() {
    carregarOpcoesMenu();
    String html = carregarHtml("https://ww70.itau.com.br/M/SaldoPoupanca.aspx", 200);
    // Link text restored from its mis-encoded form so it can match the real page text.
    Document doc = carregarHtmlDeLink(html, "a[href^=SaldoPoupanca]", "Últimos 30 dias");
    Element tableExtrato = doc.getElementById("ctl00_ContentPlaceHolder1_Fieldset2");
    List<BancoRegistro> list = new ArrayList<BancoRegistro>();
    for (Element row : tableExtrato.select("div.rowPar, div.rowImpar")) {
        Elements cells = row.select("td");
        String data = cells.get(1).text();
        String desc = cells.get(2).text().trim();
        String val = cells.get(3).text();
        if (!descricoesIgnorar.contains(desc)) {
            list.add(gerarRegistro(data, desc, val));
        }
    }
    return list;
}
Example 64
| Project: bennu-master File: Component.java View source code |
/**
 * Replaces every element carrying a {@code bennu-component} attribute with the
 * output of the component registered under that attribute's value. Elements
 * whose key has no registered component are left untouched.
 *
 * @param origin HTML to process
 * @return the serialized document after replacement
 */
public static String process(String origin) {
    Document parsed = Jsoup.parse(origin);
    for (Element placeholder : parsed.select("[bennu-component]")) {
        String componentKey = placeholder.attr("bennu-component");
        Optional.ofNullable(COMPONENTS.get(componentKey))
                .ifPresent(handler -> placeholder.replaceWith(handler.process(placeholder)));
    }
    return parsed.toString();
}
Example 65
| Project: CarHome-master File: TMallHomePageDownload.java View source code |
/**
 * Downloads a home page together with its asynchronously loaded modules.
 *
 * <p>The site id and the widget ids are read from the main page; one additional
 * request is made per widget id, with a call to {@code sleep(1, 3)} after each.
 *
 * @param url address of the home page; must not be {@code null}
 * @return a page made of the main HTML and the HTML of every async module
 * @throws IOException when {@code url} is malformed, or as raised by the document download
 */
@Override
public Page download(String url) throws IOException {
    Preconditions.checkNotNull(url);
    URL indexUrl = new URL(url);
    Html mainHtml = Html.create(getDocument(url, "UTF-8").html());
    String siteId = mainHtml.regex("site_instance_id=(\\d+)", 1).get();
    List<String> widgetIds = mainHtml.xpath("//div[@class='J_TAsyncModule']/@data-widgetid").all();
    List<Html> asyncHtmlList = Lists.newArrayListWithExpectedSize(widgetIds.size());
    for (String widgetId : widgetIds) {
        String asyncUrl = String.format(ASYNC_URL_TEMPLATE, indexUrl.getHost(), widgetId, indexUrl.getPath(), widgetId, siteId);
        String raw = getDocument(asyncUrl, "UTF-8").html();
        // NOTE(review): the end index is exclusive, so the last '}' itself is dropped,
        // and a response without braces makes substring throw — confirm this is intended.
        String aContent = raw.substring(raw.indexOf("{"), raw.lastIndexOf("}"));
        if (LOGGER.isInfoEnabled()) {
            LOGGER.info("content is :{}", aContent);
        }
        asyncHtmlList.add(Html.create(aContent));
        sleep(1, 3);
    }
    return Page.create(url, mainHtml, asyncHtmlList);
}
Example 66
| Project: clicker-master File: CN88ProxyGetter.java View source code |
/**
 * Scrapes proxy list pages 2 to 10 and collects one proxy per table row.
 *
 * <p>The first table of each page is read; its first row is skipped and the
 * first two cells of every other row are taken as host and port. Pages and rows
 * that cannot be read or parsed are silently skipped.
 *
 * @return the proxies found; possibly empty, never {@code null}
 */
@Override
public Set<Proxy> find() {
    final Set<Proxy> ret = new HashSet<Proxy>();
    for (int page = 2; page < 11; page++) {
        try {
            final Document doc = Jsoup.parse(new URL("http://www.cz88.net/proxy/http_" + page + ".aspx"), TIMEOUT);
            final Elements rows = doc.getElementsByTag("table").get(0).getElementsByTag("tr");
            for (int row = 1; row < rows.size(); row++) {
                try {
                    final Elements cells = rows.get(row).getElementsByTag("td");
                    final String host = cells.get(0).text();
                    final int port = Integer.valueOf(cells.get(1).text());
                    ret.add(new Proxy(host, port, this.properties));
                } catch (final Exception ignored) {
                    // best effort: skip rows without a usable host/port pair
                }
            }
        } catch (final Exception ignored) {
            // best effort: skip pages that cannot be fetched or have no table
        }
    }
    return ret;
}
Example 67
| Project: CN1ML-NetbeansModule-master File: TreeBuilder.java View source code |
/**
 * Resets the builder state for a new parse run: creates the target document,
 * the character reader, the tokeniser and an empty element stack.
 *
 * @param input   the text to parse; must not be {@code null}
 * @param baseUri base URI of the new document; must not be {@code null}
 * @param errors  list handed to the tokeniser and kept in the {@code errors} field
 */
protected void initialiseParse(String input, String baseUri, ParseErrorList errors) {
    Validate.notNull(input, "String input must not be null");
    Validate.notNull(baseUri, "BaseURI must not be null");
    this.doc = new Document(baseUri);
    this.reader = new CharacterReader(input);
    this.errors = errors;
    this.tokeniser = new Tokeniser(this.reader, errors);
    this.stack = new DescendableLinkedList<Element>();
    this.baseUri = baseUri;
}
Example 68
| Project: crawler-master File: DefaultAssetsParser.java View source code |
/**
 * Collects the assets referenced by a document: the absolute {@code href} of
 * every {@code link[href]} and the absolute {@code src} of every element that
 * has a {@code src} attribute.
 *
 * @param doc     document to inspect
 * @param referer referer stored with every returned URL
 * @return the set of asset URLs
 */
@Override
public Set<CrawlerURL> getAssets(Document doc, String referer) {
    Elements withSource = doc.select("[src]");
    Elements linked = doc.select("link[href]");
    Set<CrawlerURL> assets = new HashSet<CrawlerURL>(withSource.size() + linked.size());
    for (Element element : linked) {
        assets.add(new CrawlerURL(element.attr("abs:href"), referer));
    }
    for (Element element : withSource) {
        assets.add(new CrawlerURL(element.attr("abs:src"), referer));
    }
    return assets;
}
Example 69
| Project: curiosity-maps-master File: WebCrawler.java View source code |
/**
 * Executes a GET on the given connection and returns the parsed document.
 *
 * <p>Any {@link IOException} is rethrown wrapped in a {@link RuntimeException}.
 * When the failure is an HTTP 503 the calling thread first sleeps for
 * {@code backoffTime}; the request itself is not repeated here.
 *
 * @param conn prepared connection
 * @return the fetched document
 * @throws RuntimeException wrapping the I/O failure, or the interruption when the
 *         back-off sleep is interrupted (the interrupt flag is restored first)
 */
protected Document httpGet(Connection conn) {
    try {
        // TODO: execute network request in a separate thread pool
        return conn.get();
    } catch (IOException e) {
        if (e instanceof HttpStatusException && ((HttpStatusException) e).getStatusCode() == 503) {
            try {
                Thread.sleep(backoffTime);
            } catch (InterruptedException interrupted) {
                // Restore the flag so callers further up can still see the interruption.
                Thread.currentThread().interrupt();
                throw new RuntimeException(interrupted);
            }
        }
        throw new RuntimeException(e);
    }
}
Example 70
| Project: en-webmagic-master File: CssSelector.java View source code |
/**
 * Applies the configured CSS selector to the text and returns the value of
 * every matching element, leaving out empty values.
 *
 * @param text HTML to search
 * @return the non-empty values; an empty list when nothing matches
 */
@Override
public List<String> selectList(String text) {
    List<String> values = new ArrayList<String>();
    Elements matches = Jsoup.parse(text).select(selectorText);
    if (!CollectionUtils.isNotEmpty(matches)) {
        return values;
    }
    for (Element match : matches) {
        String value = getValue(match);
        if (StringUtils.isEmpty(value)) {
            continue;
        }
        values.add(value);
    }
    return values;
}
Example 71
| Project: EventApp-master File: BazaarEntryLoader.java View source code |
/**
 * Turns every {@code table} of the response into a {@code BazaarEntry} and hands
 * the list to the listener.
 *
 * <p>For tables with at least three rows the first three rows provide name,
 * title and summary, and the first link of the summary row provides the URL.
 * Tables with fewer rows still produce an entry, with none of these fields set
 * here.
 *
 * @param body HTML of the response
 */
@Override
public void onResponse(String body) {
    List<BazaarEntry> entries = new ArrayList<BazaarEntry>();
    for (Element table : Jsoup.parse(body).select("table")) {
        BazaarEntry entry = new BazaarEntry();
        Elements rows = table.select("tr");
        if (rows.size() >= 3) {
            Element summaryRow = rows.get(2);
            entry.setName(rows.get(0).text());
            entry.setTitle(rows.get(1).text());
            entry.setSummary(summaryRow.text());
            Elements anchors = summaryRow.select("a");
            if (!anchors.isEmpty()) {
                entry.setUrl(anchors.attr("href"));
            }
        }
        entries.add(entry);
    }
    listener.onSuccess(entries);
}
Example 72
| Project: extentreports-java-master File: SystemAttributeTests.java View source code |
/**
 * Flushes the report and asserts that both the key and the value appear as the
 * text of some {@code .environment td} cell of the generated HTML file.
 *
 * @param key   expected cell text for the key
 * @param value expected cell text for the value
 */
private void performAssertForKVPairs(String key, String value) {
    extent.flush();
    Document report = Jsoup.parse(Reader.readAllText(htmlFilePath));
    boolean keyFound = false;
    boolean valueFound = false;
    for (Element cell : report.select(".environment td")) {
        keyFound = keyFound || cell.text().equals(key);
        valueFound = valueFound || cell.text().equals(value);
    }
    Assert.assertTrue(keyFound);
    Assert.assertTrue(valueFound);
}
Example 73
| Project: FakeWeather-master File: MzituZiPaiFragment.java View source code |
/**
 * Downloads a listing page (10000 ms timeout) and creates one {@code Girl} per
 * {@code li} of the first {@code div.postlist}, using the {@code src} of the
 * item's first image.
 *
 * @param url listing page to download
 * @return the collected items; whatever was gathered before an {@link IOException}
 */
@Override
public List<Girl> call(String url) {
    List<Girl> girls = new ArrayList<>();
    try {
        Element postList = Jsoup.connect(url).timeout(10000).get().select("div.postlist").first();
        for (Element item : postList.select("li")) {
            String imageSource = item.select("img").first().attr("src");
            girls.add(new Girl(imageSource));
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return girls;
}
Example 74
| Project: FudanBBS-master File: TreeBuilder.java View source code |
/**
 * Prepares the builder for parsing: validates the arguments, then creates a new
 * document, reader, tokeniser and element stack.
 *
 * @param input   the text to parse; must not be {@code null}
 * @param baseUri base URI given to the new document; must not be {@code null}
 * @param errors  error list stored in the {@code errors} field and passed to the tokeniser
 */
protected void initialiseParse(String input, String baseUri, ParseErrorList errors) {
    Validate.notNull(input, "String input must not be null");
    Validate.notNull(baseUri, "BaseURI must not be null");

    // Output side: the document being built and its open-element stack.
    this.doc = new Document(baseUri);

    // Input side: the reader and the tokeniser that consumes it.
    this.reader = new CharacterReader(input);
    this.errors = errors;
    this.tokeniser = new Tokeniser(reader, errors);

    this.stack = new DescendableLinkedList<Element>();
    this.baseUri = baseUri;
}
Example 75
| Project: Gazetti_Newspaper_Reader-master File: toi.java View source code |
/**
 * Downloads the article at {@code mArticleURL} and extracts title, header image
 * URL and body text.
 *
 * <p>The page is requested once; the document is parsed from that response.
 * {@code <br />} tags of the body are converted to newlines in the returned text.
 * The {@code titleText}, {@code mImageURL} and {@code bodyText} fields are
 * updated as a side effect.
 *
 * @return a three-element array {title, image URL, body text}, or {@code null}
 *         when the response status is not 200 or any exception occurs (the
 *         exception is reported to Crashlytics)
 */
public String[] getToiArticleContent() {
    String[] result = new String[3];
    String url = mArticleURL;
    try {
        Connection connection = Jsoup.connect(url).userAgent("Mozilla").timeout(10 * 1000);
        Response response = connection.execute();
        if (response == null) {
            Crashlytics.log("Is response null ? " + (null == response));
            return null;
        } else if (response.statusCode() != 200) {
            Crashlytics.log("Received response - " + response.statusCode() + " -- " + response.statusMessage());
            Crashlytics.log("Received response - " + response.body());
            return null;
        }
        // Parse the body that execute() already downloaded instead of sending a second GET.
        Document doc = response.parse();
        // get Title
        String ToiTitleXPath = ConfigService.getInstance().getTOIHead();
        titleText = doc.select(ToiTitleXPath).text();
        // get HeaderImageUrl
        mImageURL = getImageURL(doc);
        String ToiArticleXPath = ConfigService.getInstance().getTOIBody();
        Element bodyArticleElements = doc.select(ToiArticleXPath).first();
        // "$$$" is a placeholder that survives text extraction and is swapped for newlines afterwards.
        String temp = bodyArticleElements.html().replace("<br />", "$$$");
        Document bodyNewLine = Jsoup.parse(temp);
        bodyText = bodyNewLine.text().replace("$$$", "\n");
        result[0] = titleText;
        result[1] = mImageURL;
        result[2] = bodyText;
    } catch (IOException e) {
        Crashlytics.logException(e);
        return null;
    } catch (NullPointerException npe) {
        // Raised when the body selector matches nothing.
        bodyText = null;
        Crashlytics.logException(npe);
        return null;
    } catch (Exception e) {
        Crashlytics.logException(e);
        return null;
    }
    return result;
}
Example 76
| Project: gvoa-master File: ItemHtmlParser.java View source code |
/**
 * Downloads the page behind an RSS item and fills in its MP3 URL and full text.
 *
 * <p>The MP3 URL comes from {@code a[id=mp3]}; the full text is the HTML of the
 * {@code content} element after its first {@code div.contentImage} has been
 * removed. The item status is set to {@code RssItem.E_PARSE_TXT_OK} at the end.
 *
 * @param item item to complete; its link is the page that gets downloaded
 * @throws Exception declared by this method; also raised (as a
 *         {@link NullPointerException}) when the page has no {@code content} element
 */
public static void parseItemDetail(RssItem item) throws Exception {
    Document doc = Jsoup.parse(NetworkUtil.httpGetContent(item.getLink()));

    Element mp3link = doc.select("a[id=mp3]").first();
    if (mp3link == null) {
        Log.i(tag, "can't get mp3");
    } else {
        Log.i(tag, mp3link.attr("href"));
        item.setMp3url(mp3link.attr("href"));
    }

    Element content = doc.getElementById("content");
    Element image = content.select("div.contentImage").first();
    if (image != null) {
        Log.i(tag, "remove image element from content");
        image.remove();
    }
    String contentStr = content.html();
    Log.i(tag, contentStr);
    item.setFullText(contentStr);

    // The lyrics link is only logged, not stored.
    Element lrclink = content.select("a[id=lrc]").first();
    if (lrclink != null) {
        Log.i(tag, lrclink.attr("href"));
    }
    item.setStatus(RssItem.E_PARSE_TXT_OK);
}
Example 77
| Project: HabReader-master File: PostShowLoader.java View source code |
/**
 * Downloads the post page and copies title, hubs, content, date and author into
 * a {@code PostsFullData}.
 *
 * <p>When the page has no {@code span.post_title} the content is set to the
 * {@code error_404} string instead. An {@link IOException} is ignored and the
 * data object is returned as it is at that point.
 *
 * @return the loaded data, never {@code null}
 */
@Override
public PostsFullData loadInBackground() {
    PostsFullData data = new PostsFullData();
    try {
        Document page = Jsoup.connect(url).get();
        Element title = page.select("span.post_title").first();
        if (title == null) {
            data.setContent(context.getString(R.string.error_404));
        } else {
            data.setUrl(url);
            data.setTitle(title.text());
            data.setHubs(page.select("div.hubs").first().text());
            data.setContent(page.select("div.content").first().html());
            data.setDate(page.select("div.published").first().text());
            data.setAuthor(page.select("div.author > a").first().text());
        }
    } catch (IOException ignored) {
        // download failures leave the data object as built so far
    }
    return data;
}
Example 78
| Project: HackerNews-master File: UserParser.java View source code |
/**
 * Loads the profile page of a user and reads the created date, karma, average
 * and "about" HTML from the profile table.
 *
 * <p>An average that cannot be read is stored as {@code -1.0f}.
 *
 * @param username id of the user to load
 * @return the parsed user, or {@code null} when an {@link IOException},
 *         {@link NumberFormatException} or {@link NullPointerException} occurs
 */
public static User parseUser(String username) {
    try {
        User user = new User();
        user.username = username;
        // don't use user cookie so that "about" text appears correctly
        Elements rows = ConnectionManager.anonConnect("/user?id=" + username).get().select("form > table > tbody > tr");
        // Each value sits in the cell right after the cell holding its label.
        user.created = rows.select("td:containsOwn(created:) + td").first().text();
        String karmaText = rows.select("td:containsOwn(karma:) + td").first().text();
        user.karma = Integer.parseInt(karmaText);
        try {
            String averageText = rows.select("td:containsOwn(avg:) + td").first().text();
            user.avg = Float.parseFloat(averageText);
        } catch (Exception e) {
            user.avg = -1.0f;
        }
        user.aboutHtml = rows.select("td:containsOwn(about:) + td").first().html();
        return user;
    } catch (IOException e) {
        e.printStackTrace();
        Log.d(TAG, "IOException parsing UserModel for: " + username);
    } catch (NumberFormatException e) {
        e.printStackTrace();
        Log.d(TAG, "NumberFormatException parsing UserModel for: " + username);
    } catch (NullPointerException e) {
        e.printStackTrace();
        Log.d(TAG, "NullPointerException parsing UserModel for: " + username);
    }
    return null;
}
Example 79
| Project: HappyResearch-master File: MTimeCrawler.java View source code |
/**
 * Downloads every URL listed in {@code ./src/main/resources/mtime.txt} and
 * stores each page as {@code <name>/<name>.html} under {@code dir}, where the
 * name is the filtered text of {@code span[property=v:itemreviewed]}.
 *
 * @throws Exception declared by this method; presumably raised by the file or
 *         download helpers — confirm against {@code FileIO} / {@code URLReader}
 */
public void crawl_web_pages() throws Exception {
    List<String> urls = FileIO.readAsList("./src/main/resources/mtime.txt");
    for (String url : urls) {
        String html = URLReader.read(url);
        String rawName = Jsoup.parse(html).select("span[property=v:itemreviewed]").text();
        String name = Strings.filterWebString(rawName, '_');
        String dirPath = dir + name + "/";
        FileIO.makeDirectory(dirPath);
        FileIO.writeString(dirPath + name + ".html", html);
    }
}
Example 80
| Project: jabref-master File: ACS.java View source code |
/**
 * Tries to find a fulltext URL for a given BibTex entry.
 *
 * Currently only uses the DOI if found: the source page for the DOI is loaded
 * and, when it contains a {@code .pdf-high-res a} link, the PDF URL is derived
 * from the source URL by replacing {@code /abs/} with {@code /pdf/}.
 *
 * @param entry The Bibtex entry
 * @return The fulltext PDF URL Optional, if found, or an empty Optional if not found.
 * @throws NullPointerException if no BibTex entry is given
 * @throws java.io.IOException
 */
@Override
public Optional<URL> findFullText(BibEntry entry) throws IOException {
    Objects.requireNonNull(entry);
    // DOI search
    Optional<DOI> doi = entry.getField(FieldName.DOI).flatMap(DOI::parse);
    if (!doi.isPresent()) {
        return Optional.empty();
    }
    String source = String.format(SOURCE, doi.get().getDOI());
    // Retrieve PDF link
    Element link = Jsoup.connect(source).ignoreHttpErrors(true).get().select(".pdf-high-res a").first();
    if (link == null) {
        return Optional.empty();
    }
    LOGGER.info("Fulltext PDF found @ ACS.");
    return Optional.of(new URL(source.replaceFirst("/abs/", "/pdf/")));
}
Example 81
| Project: jacorb-master File: Client.java View source code |
/**
 * Reads the server IOR from the servlet page, connects to the remote
 * {@code GoodDay} object and records a string on it.
 *
 * @param args optional; {@code args[0]} is the string to record, a random UUID
 *             is used when it is absent
 * @throws Exception on any failure
 */
public static void main(String args[]) throws Exception {
    String updateString = args.length >= 1 ? args[0] : UUID.randomUUID().toString();

    // Grab the IOR from the servlet.
    Document iorPage = Jsoup.connect("http://localhost:8080/jacorb-appserver/PrintIOR").get();
    String ior = iorPage.select("h1").first().text();
    System.out.println("Retrieved ior " + ior);

    Properties orbProps = new Properties();
    orbProps.setProperty("org.omg.CORBA.ORBClass", "org.jacorb.orb.ORB");
    orbProps.setProperty("org.omg.CORBA.ORBSingletonClass", "org.jacorb.orb.ORBSingleton");
    orbProps.setProperty("jacorb.interop.null_string_encoding", "true");
    ORB orb = ORB.init(args, orbProps);
    GoodDay goodDay = GoodDayHelper.narrow(orb.string_to_object(ior));

    // Invoke remote server
    System.out.println("Retrieved initial string " + goodDay.get_string());
    goodDay.record_string(updateString);
    System.out.println("Retrieved string " + goodDay.get_string());
}
Example 82
| Project: janglipse-master File: KeywordDocParser.java View source code |
/**
 * Extracts one keyword documentation entry per {@code table.detailHeader}: the
 * name is the text of its first {@code td.detailHeaderName}, the description the
 * outer HTML of the node following the table.
 *
 * @param doc document to read
 * @return the entries in document order; empty when there is no such table
 */
private List<KeywordDocumentation> parse(Document doc) {
    List<KeywordDocumentation> keywords = new ArrayList<KeywordDocumentation>();
    for (Element headerTable : doc.select("table.detailHeader")) {
        KeywordDocumentation keyword = new KeywordDocumentation();
        keyword.setName(headerTable.select("td.detailHeaderName").get(0).text());
        keyword.setDescription(headerTable.nextSibling().outerHtml());
        keywords.add(keyword);
    }
    return keywords;
}
Example 83
| Project: java-manga-reader-master File: MangaUtil.java View source code |
/**
 * Retrieves a list of licensed Manga from Anime News Network.
 *
 * <p>Titles are normalised: a leading {@code "(The)"} becomes {@code "The"} and
 * everything from the last {@code '('} onwards is cut off.
 *
 * @return A list of Manga licensed in English.
 * @throws IOException If it cannot complete the request.
 */
public static List<String> getLicensedManga() throws IOException {
    String listUrl = "http://www.animenewsnetwork.com/encyclopedia/anime-list.php"
            + "?licensed=1"
            + "&sort=title"
            + "&showG=1";
    // maxBodySize(0) is passed so the body size is not limited by jsoup.
    Elements entries = Jsoup.connect(listUrl).maxBodySize(0).get().getElementsByClass("HOVERLINE");
    List<String> blackList = new ArrayList<String>(entries.size());
    for (Element entry : entries) {
        String title = entry.text();
        if (title.startsWith("(The)")) {
            title = title.replace("(The)", "The");
        }
        if (title.contains("(")) {
            int suffixStart = title.lastIndexOf('(');
            title = title.substring(0, suffixStart).trim();
        }
        blackList.add(title);
    }
    return blackList;
}
Example 84
| Project: JAVMovieScraper-master File: Data18SharedMethods.java View source code |
//Used to implement the SecurityPassthrough interface for both data18 scrapers public static Document runSecurityPassthrough(Document document, SearchResult originalSearchResult) { //find the first link in the document, download the href, then try to download the original result again if (document != null) { Element firstLink = document.select("a").first(); if (firstLink != null && firstLink.attr("href") != null) { Document captchaSolved = SiteParsingProfile.downloadDocument(new SearchResult(firstLink.attr("href"))); if (captchaSolved != null) { return SiteParsingProfile.downloadDocument(originalSearchResult); } } } return document; }
Example 85
| Project: JCommons-master File: DownloaderTest.java View source code |
/**
 * Downloads the papers linked from a Stack Exchange meta question into
 * {@code D:/dl}.
 *
 * <p>Every element whose text contains {@code [PDF]} or {@code [arXiv]} and that
 * has a non-blank {@code href} is considered. The file name is taken from the
 * first {@code strong} element of the link's parent, with {@code :}, quotes and
 * {@code ?} removed, and always ends in {@code .pdf}. Links that cannot be
 * opened are skipped.
 *
 * @param args unused
 * @throws IOException when the page cannot be loaded, a link is not a valid URL,
 *         or copying a file fails
 */
public static void main(String[] args) throws IOException {
    Document doc = Jsoup.connect("http://meta.stackexchange.com/questions/134495/academic-papers-using-stack-exchange-data").get();
    Elements eles = doc.getElementsContainingText("[PDF]");
    eles.addAll(doc.getElementsContainingText("[arXiv]"));
    String folderName = "D:/dl";
    for (Element ele : eles) {
        String src = ele.attr("href");
        if (src == null || src.trim().equals("")) {
            continue;
        }
        URL url = new URL(src);
        Element nameEle = ele.parent().getElementsByTag("strong").get(0);
        String fileName = nameEle.text().replace(":", " ").replace("\"", "").replace("'", "").replace("?", "");
        if (fileName.contains("Fit or")) {
            continue;
        }
        if (!fileName.endsWith(".")) {
            fileName = fileName.concat(".");
        }
        fileName = fileName.concat("pdf");
        System.out.println(fileName);
        // Open the source first so an unreachable link is skipped before any file is created.
        final InputStream opened;
        try {
            opened = url.openStream();
        } catch (Exception e) {
            continue;
        }
        // Both streams are closed even when the copy fails part-way.
        try (InputStream in = opened;
                OutputStream out = new BufferedOutputStream(new FileOutputStream(folderName + "/" + fileName))) {
            byte[] buffer = new byte[8192];
            for (int read; (read = in.read(buffer)) != -1; ) {
                out.write(buffer, 0, read);
            }
        }
    }
}
Example 86
| Project: jeboorker-master File: ComicsOrgDownloader.java View source code |
/**
 * Searches for a phrase: loads the search result pages, collects the result
 * links from them, loads the linked pages and converts those into metadata entries.
 *
 * @param phrase text to search for
 * @return the entries found, or {@code null} when an {@link IOException} occurred
 *         (a warning is logged)
 */
@Override
public List<MetadataDownloadEntry> search(String phrase) {
    try {
        List<URL> pageUrls = MetadataDownloadUtils.getSearchPageUrls(phrase, PAGES_TO_LOAD, QUERY_URL);
        List<byte[]> resultPages = MetadataDownloadUtils.loadPages(pageUrls, PAGES_TO_LOAD);
        List<Document> resultDocs = MetadataDownloadUtils.getDocuments(resultPages, MAIN_URL);
        List<String> detailLinks = findSearchResultLinks(resultDocs);
        List<byte[]> detailPages = MetadataDownloadUtils.loadLinkContent(detailLinks, MAIN_URL);
        return getMetadataDownloadEntries(detailPages);
    } catch (IOException e) {
        LoggerFactory.getLogger(this).log(Level.WARNING, "Failed to fetch metadata for search '" + phrase + "'", e);
        return null;
    }
}
Example 87
| Project: JKuuza-master File: ConditionsResolver.java View source code |
/**
 * Apply all conditions on this Document.
 *
 * <p>Every condition is evaluated, even after one has failed; each failing
 * condition is added to {@code failedConditions}.
 *
 * @param doc Jsoup Document with html code of a web page
 * @return true if ALL conditions pass, false if AT LEAST ONE not pass
 * @throws InstantiationException
 * @throws IllegalAccessException
 * @throws ClassNotFoundException
 * @throws IllegalArgumentException
 * @throws InvocationTargetException
 * @throws NoSuchMethodException
 */
public boolean resolve(Document doc) throws InstantiationException, IllegalAccessException, ClassNotFoundException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException {
    boolean allPassed = true;
    for (Condition condition : conditions) {
        String[] params = new String[condition.getParams().size()];
        condition.getParams().toArray(params);
        condition.getConditionObject().setDocument(doc);
        Object result = Reflector.call(condition.getConditionObject(), condition.getFunctionName(), params);
        boolean passed = result.toString().equals(condition.getExpectedValue());
        if (!passed) {
            allPassed = false;
            failedConditions.add(condition);
        }
    }
    return allPassed;
}
Example 88
| Project: karma-exchange-master File: SalesforceUtil.java View source code |
/**
 * Rewrites the {@code src} of every image served from the image CDN so that it
 * points to the source's own domain over HTTPS, keeping path and query.
 *
 * <p>Images whose {@code src} is not a valid URI, has no host (for example a
 * relative path) or belongs to another domain are left unchanged.
 *
 * @param doc        document whose {@code img} elements are updated in place
 * @param sourceInfo provides the domain used in the rewritten URLs
 */
private static void updateSalesforceCdnImgLinks(Document doc, EventSourceInfo sourceInfo) {
    for (Element img : doc.getElementsByTag("img")) {
        URI uri;
        try {
            uri = new URI(img.attr("src"));
        } catch (URISyntaxException ignored) {
            // not a parseable URI: nothing to rewrite
            continue;
        }
        // getHost() is null for relative references; those cannot be CDN links.
        String host = uri.getHost();
        if (host == null || !host.toLowerCase(java.util.Locale.ROOT).endsWith(IMG_CDN_DOMAIN)) {
            continue;
        }
        // Raw (still encoded) components keep escapes such as %20 intact in the new URL.
        StringBuilder rewritten = new StringBuilder("https://").append(sourceInfo.getDomain()).append(uri.getRawPath());
        if (uri.getRawQuery() != null) {
            rewritten.append('?').append(uri.getRawQuery());
        }
        img.attr("src", rewritten.toString());
    }
}
Example 89
| Project: kempes-master File: EvomagProductExtractor.java View source code |
/**
 * Fills a product from a product page: the name is the text of the first
 * {@code div h1}, the price is parsed from the first space-separated token of
 * the own text of the first {@code div.pret_ron}.
 *
 * @param page   page whose document is read
 * @param object product to populate
 * @throws Exception declared by this method; also raised when either element is missing
 */
@Override
public void map(WebPage page, Product object) throws Exception {
    Document document = page.getDocument();
    // parse for title
    Element heading = document.select("div h1").first();
    object.setName(heading.text());
    // parse for price
    Element priceBox = document.select("div.pret_ron").first();
    String amount = priceBox.ownText().split(" ")[0];
    object.setPrice(parsePrice(amount));
}
Example 90
| Project: ManalithBot-master File: TranslatorPlugin.java View source code |
/**
 * Translates a message through the Bing translator service.
 *
 * <p>The message and the target language are URL-encoded (UTF-8) before being
 * placed into the request URL; the client secret is sent as HTTP Basic
 * credentials.
 *
 * @param to      target language code
 * @param message text to translate
 * @return the text of the {@code d|text[m:type=Edm.String]} elements of the
 *         response, or a fixed "nothing to translate" message when the request fails
 */
@BotCommand("번역")
public String translate(@Option(name = "ko|en...", help = "번역할 대상 언어") String to, @Option(name = "메시지", help = "번역할 메시지") String message) {
    final String url = "https://api.datamarket.azure.com/Bing/MicrosoftTranslator/v1/Translate?Text='%s'&To='%s'";
    String login = "USER_ID_IGNORED:" + clientSecret;
    // Explicit charset: the encoded credentials must not depend on the platform default.
    String base64login = new String(Base64.encodeBase64(login.getBytes(java.nio.charset.StandardCharsets.UTF_8)), java.nio.charset.StandardCharsets.UTF_8);
    try {
        // Encode both values so spaces, '&', '#' or non-ASCII text cannot break the query string.
        String encodedMessage = java.net.URLEncoder.encode(message, "UTF-8");
        String encodedTarget = java.net.URLEncoder.encode(to, "UTF-8");
        Document doc = Jsoup.connect(String.format(url, encodedMessage, encodedTarget)).header("Authorization", "Basic " + base64login).ignoreContentType(true).get();
        logger.debug("response", doc);
        Elements elem = doc.select("d|text[m:type=Edm.String]");
        return elem.text();
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
    }
    return "번역할 내용이 없습니다.";
}
Example 91
| Project: mayocat-shop-master File: DefaultPdfTemplateRenderer.java View source code |
/**
 * Renders a template to HTML, normalizes it to XHTML with jsoup and writes the resulting PDF
 * to the given stream.
 *
 * @param outputStream destination of the generated PDF
 * @param template template to render
 * @param renderingRoot directory whose absolute URI is passed to the renderer together with
 *        the document
 * @param context values made available to the template
 * @throws PdfRenderingException wrapping any template rendering or PDF generation failure
 */
@Override
public void generatePDF(OutputStream outputStream, Path template, Path renderingRoot, Map<String, Object> context) throws PdfRenderingException {
    ITextRenderer renderer = new ITextRenderer();
    try {
        String html = templateRenderer.renderAsString(template, context);
        // Ensure we have a valid XHTML document using JSoup
        Document jsoupDoc = Jsoup.parse(html);
        jsoupDoc.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
        jsoupDoc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        jsoupDoc.outputSettings().charset("UTF-8");
        String path = renderingRoot.toAbsolutePath().toUri().toString();
        renderer.setDocumentFromString(jsoupDoc.toString(), path);
        renderer.layout();
        renderer.createPDF(outputStream);
    } catch (DocumentException | TemplateRenderingException e) {
        // Multi-catch: both failure kinds are reported to the caller the same way, with the
        // original exception preserved as the cause.
        throw new PdfRenderingException(e);
    }
}
Example 92
| Project: medium-textview-master File: JsoupUtils.java View source code |
/**
 * Collects the {@code abs:src} attribute value of every {@code iframe} element found in the
 * given HTML.
 *
 * @param content HTML markup to scan
 * @return the collected values, in document order; empty when there is no iframe with a src
 */
public static List<String> findAllVideoLinks(String content) {
    final Document parsed = Jsoup.parse(content);
    final List<String> iframeSources = new ArrayList<>();
    for (Element candidate : parsed.select("[src]")) {
        // Elements carrying a src attribute that are not iframes are skipped.
        if (!candidate.tagName().equals("iframe")) {
            continue;
        }
        iframeSources.add(candidate.attr("abs:src"));
    }
    return iframeSources;
}
Example 93
| Project: memorabilia-master File: PostTag.java View source code |
/**
 * Gathers the elements matched by this tag's selector, followed by every {@code script}
 * element whose HTML contains this tag's pattern.
 *
 * @param doc document to search
 * @return the combined matches; a script may appear twice if the selector also matched it
 */
public Elements selectFrom(Document doc) {
    Elements matches = new Elements();
    // Despite its name, the xpath field is handed to jsoup's select().
    matches.addAll(doc.select(xpath));
    for (Element candidate : doc.getElementsByTag("script")) {
        if (candidate.html().contains(pattern)) {
            matches.add(candidate);
        }
    }
    return matches;
}
Example 94
| Project: mensaapp-master File: WeeklyMenuTask.java View source code |
/**
 * Downloads and parses the weekly menu at each URL, then merges the results.
 *
 * <p>Processing stops at the first failure: the returned pair then carries {@code null} as
 * menu and the exception that occurred.
 *
 * @param urls menu page URLs, processed in order
 * @return (merged menu, null) on success, or (null, exception) on the first failure
 */
@Override
protected Pair<WeeklyMenu, Exception> doInBackground(String... urls) {
    final List<WeeklyMenu> collected = new ArrayList<WeeklyMenu>();
    for (final String address : urls) {
        try {
            final Document page = Jsoup.connect(address).get();
            collected.add(WeeklyMenuParser.create(context, page, mensa).parse());
        } catch (WeeklyMenuParseException parseFailure) {
            final String message = String.format(context.getString(R.string.error_menu_parse), address);
            Log.w(TAG, message, parseFailure);
            return new Pair<WeeklyMenu, Exception>(null, parseFailure);
        } catch (Exception otherFailure) {
            // Anything that is not a parse failure is reported with the download message.
            final String message = String.format(context.getString(R.string.error_menu_download), address);
            Log.e(TAG, message, otherFailure);
            return new Pair<WeeklyMenu, Exception>(null, otherFailure);
        }
    }
    final WeeklyMenu merged = WeeklyMenu.merge(mensa, Utils.now(), collected);
    return new Pair<WeeklyMenu, Exception>(merged, null);
}
Example 95
| Project: meta-server-master File: ServerHtmlContentTest.java View source code |
/**
 * Fetches the server list page and checks that the first table row shows the name, owner,
 * port and address of {@code firstEntry}.
 */
@Test
public void testShowHtml() throws IOException {
    Document page = Jsoup.connect(URL_BASE + "/servers/show").get();

    Element serverList = page.getElementById("server-list");
    Assert.assertTrue(serverList.nodeName().equals("table"));

    // First row of the table body.
    Element topRow = serverList.select("tbody").first().select("tr").first();
    Assert.assertEquals(firstEntry.getName(), topRow.getElementsByClass("server-name").first().text());
    Assert.assertEquals(firstEntry.getOwner(), topRow.getElementsByClass("server-owner").first().text());
    Assert.assertEquals("" + firstEntry.getPort(), topRow.getElementsByClass("server-port").first().text());
    Assert.assertEquals(firstEntry.getAddress(), topRow.getElementsByClass("server-address").first().text());
}
Example 96
| Project: mini-blog-master File: YouKuVideoHandler.java View source code |
/**
 * Gets a Youku video.
 *
 * <p>URLs that do not contain {@code v.youku.com} are passed to the successor handler when
 * one is set.
 *
 * @param url
 *            video URL
 * @return the extracted video, or {@code null} when extraction fails or when the URL is not
 *         a Youku URL and there is no successor
 */
public Video getVideo(String url) {
    if (url.indexOf("v.youku.com") != -1) {
        try {
            Document doc = VideoUtil.getURLContent(url);
            // Get the video title
            String title = doc.title();
            // Get the video thumbnail: everything after "pic=" in the s_sina link
            String pic = VideoUtil.getElementAttrById(doc, "s_sina", "href");
            int local = pic.indexOf("pic=");
            pic = pic.substring(local + 4);
            // Get the video address
            String flash = VideoUtil.getElementAttrById(doc, "link2", "value");
            // Get the video duration
            String time = VideoUtil.getElementAttrById(doc, "download", "href");
            if (time != null && !"".equals(time)) {
                String[] arrays = time.split("\\|");
                // Guard the index: a link with fewer than five '|'-separated fields must not
                // abort the whole extraction with an ArrayIndexOutOfBoundsException.
                if (arrays.length > 4) {
                    time = arrays[4];
                }
            }
            Video video = new Video();
            video.setPic(pic);
            video.setFlash(flash);
            // NOTE(review): the duration parsed above is not used; an empty string is stored.
            // Confirm whether setTime(time) was intended.
            video.setTime("");
            video.setTitle(title);
            return video;
        } catch (Exception e) {
            // Pass the exception to the logger so the stack trace lands in the log instead
            // of on stderr.
            logger.error("---------------->error is " + e.getMessage(), e);
        }
    } else if (this.successor != null) {
        return this.successor.getVideo(url);
    }
    return null;
}
Example 97
| Project: mylyn.docs-master File: DocumentProcessorTest.java View source code |
/**
 * Builds an element holding three consecutive text nodes, a child element and a final text
 * node, then checks that normalizing leaves three children with the leading texts merged.
 */
@Test
public void testNormalizeTextNodes() {
    Document document = new Document("");
    Element root = document.appendElement("root");
    root.appendText("first ").appendText("second,").appendText(" third");
    root.appendElement("break");
    root.appendText("fourth");
    assertEquals(5, root.childNodes().size());

    TestDocumentProcessor.normalizeTextNodes(root);

    assertEquals(3, root.childNodes().size());
    // Leading run of text nodes is expected as a single node.
    assertTrue(root.childNode(0) instanceof TextNode);
    assertEquals("first second, third", ((TextNode) root.childNode(0)).text());
    // Text after the child element is expected unchanged.
    assertTrue(root.childNode(2) instanceof TextNode);
    assertEquals("fourth", ((TextNode) root.childNode(2)).text());
}
Example 98
| Project: myrobotlab-master File: JSoupExtractor.java View source code |
/**
 * Parses every value of the HTML field with jsoup, runs the configured selector over it and
 * adds each matching element to the output field of the same document.
 *
 * @param doc document to read from and add to
 * @return always {@code null}
 */
@Override
public List<Document> processDocument(Document doc) {
    for (Object rawHtml : doc.getField(htmlField)) {
        org.jsoup.nodes.Document parsed = Jsoup.parse(rawHtml.toString());
        Elements matches = parsed.select(jSoupSelector);
        for (Element match : matches) {
            doc.addToField(outputField, match);
        }
    }
    return null;
}
Example 99
| Project: NewsStats-master File: NewYorkTimesContentHandler.java View source code |
/**
 * Extracts the article contained in a crawled page, if any, and adds it to {@code articles}.
 *
 * <p>Nothing is added when the page has no HTML parse data, has no element with id
 * {@code story}, lacks the heading, dateline or byline element, or is rejected by
 * {@code filterArticles}. A dateline that cannot be parsed as {@code yyyy-MM-dd} leaves the
 * article's created date {@code null}.
 *
 * @param page crawled page
 * @return the {@code articles} list, including the newly extracted article when there is one
 */
@Override
public List extractArticles(Page page) {
    if (page.getParseData() instanceof HtmlParseData) {
        System.out.println("Current URL: " + page.getWebURL());
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String html = htmlParseData.getHtml();
        Document doc = Jsoup.parseBodyFragment(html);
        Element articleElement = doc.getElementById("story");
        if (articleElement == null) {
            // if no article can be found
            return articles;
        }
        // These lookups return null when the element is missing; treat incomplete story
        // markup like a page without an article instead of failing with an NPE.
        Element headingElement = articleElement.getElementById("story-heading");
        Element datelineElement = articleElement.getElementsByClass("dateline").first();
        Element bylineElement = articleElement.getElementsByClass("byline-author").first();
        if (headingElement == null || datelineElement == null || bylineElement == null) {
            return articles;
        }
        String title = headingElement.ownText();
        String dateString = datelineElement.attr("datetime");
        Date date = null;
        try {
            date = new SimpleDateFormat("yyyy-MM-dd").parse(dateString);
        } catch (ParseException e) {
            // The article is still recorded, with a null date.
            e.printStackTrace();
        }
        String author = bylineElement.ownText();
        // StringBuilder avoids re-copying the accumulated text for every paragraph.
        StringBuilder contentBuilder = new StringBuilder();
        Elements contentElements = articleElement.select("p.story-body-text.story-content");
        for (Element contentElement : contentElements) {
            contentBuilder.append(contentElement.ownText());
        }
        String content = contentBuilder.toString();
        if (!filterArticles(content)) {
            // ignore the article if filter does not approve
            return articles;
        }
        Article article = new NewYorkTimesArticle();
        article.setTitle(title);
        article.setCreatedDate(date);
        article.setAuthor(author);
        article.setContent(content);
        articles.add(article);
    }
    return articles;
}
Example 100
| Project: nocket-master File: AbstractHtmlVisitor.java View source code |
/**
 * Appends a {@code <ul wicket:id="groupTabbedPanel">} to the first form of the document when
 * no element with that wicket id exists yet.
 *
 * <p>Does nothing unless the context's file/class name strategy is a
 * {@code GroupNameFileAndClassNameStrategy} that reports both group annotations on the domain
 * object and being the strategy for the main object.
 *
 * @param document document to extend in place
 */
protected void checkAndAddGroupTabbedPanel(Document document) {
    if (!(getContext().getFileAndClassNameStrategy() instanceof GroupNameFileAndClassNameStrategy)) {
        return;
    }
    GroupNameFileAndClassNameStrategy groupStrategy = (GroupNameFileAndClassNameStrategy) getContext().getFileAndClassNameStrategy();
    boolean applicable = groupStrategy.isDomainObjectWithGroupAnnotations() && groupStrategy.isStrategyForMainObject();
    if (!applicable) {
        return;
    }
    boolean panelAlreadyPresent = !document.getElementsByAttributeValue("wicket:id", "groupTabbedPanel").isEmpty();
    if (panelAlreadyPresent) {
        return;
    }
    Elements forms = document.getElementsByTag("form");
    if (forms.isEmpty()) {
        return;
    }
    // The panel placeholder goes at the end of the first form.
    forms.first().appendElement("ul").attr("wicket:id", "groupTabbedPanel");
}
Example 101
| Project: org.eclipse.mylyn.docs-master File: RemoveEmptySpansProcessor.java View source code |
/**
 * Cleans up span elements in the document body, in place.
 *
 * <p>Each sweep over the body's elements removes spans without children, replaces a span
 * whose only child is whitespace-only text by that text, and moves a br that is the first or
 * last child of a span to just before or just after that span. Sweeps are repeated until one
 * completes without modifying anything.
 *
 * @param document the document whose body is processed
 */
@Override
public void process(Document document) {
Element body = document.body();
boolean modifiedOne = false;
// every structural change can expose new candidates, so sweep again until a pass changes nothing
do {
modifiedOne = false;
// remove empty spans, and eliminate tags that only contain whitespace
// NOTE(review): elements are removed and moved while this loop runs; presumably
// getAllElements() returns a snapshot that is unaffected by those changes - confirm
for (Element element : body.getAllElements()) {
if (Html.isSpanElement(element)) {
// remove span with no children
List<Node> childNodes = element.childNodes();
if (childNodes.isEmpty()) {
element.remove();
modifiedOne = true;
} else {
// a span with a single text child that is only whitespace is removed (text is retained)
if (childNodes.size() == 1) {
Node node = childNodes.get(0);
if (node instanceof TextNode) {
TextNode textNode = (TextNode) node;
String text = textNode.text();
if (text.trim().length() == 0) {
// detach the text, re-insert it immediately before the span, then drop the span
textNode.remove();
element.before(textNode);
element.remove();
modifiedOne = true;
}
// runs for every span with a single text child, whether or not the span was removed above:
// the text node's parent is the span itself when the text was left in place, otherwise
// the node the text was moved into. normalizeTextNodes is defined outside this block;
// presumably it merges adjacent text nodes - confirm
normalizeTextNodes((Element) textNode.parent());
}
}
}
}
// a br within a span that is a first or last child is moved out
Element parent = element.parent();
if (//$NON-NLS-1$
element.tagName().equalsIgnoreCase("br") && Html.isSpanElement(parent)) {
List<Node> childNodes = parent.childNodes();
// identity comparison: the br must be this exact node, not merely an equal one
if (childNodes.get(0) == element) {
// leading br: move it in front of the span
element.remove();
parent.before(element);
modifiedOne = true;
} else if (childNodes.get(childNodes.size() - 1) == element) {
// trailing br: move it behind the span
element.remove();
parent.after(element);
modifiedOne = true;
}
}
}
} while (modifiedOne);
}