Java Examples for org.jsoup.nodes.Node
The following Java examples will help you understand the usage of org.jsoup.nodes.Node. These source code samples are taken from different open source projects.
Example 1
| Project: mechanize-master File: JsoupNodeHelper.java View source code |
/**
 * Locates {@code node} among the children of its parent.
 *
 * @param node   the node to look up
 * @param byType when {@code true} the candidate children are selected by the node's own
 *               name ({@code getName(node)}); otherwise the universal tag selector is used
 * @return an {@link Index} built from the node's position in the candidate list and the
 *         size of that list; a node without a parent is looked up in an empty list
 */
@Override
public Index getIndexInParent(final Node node, final boolean byType) {
    final String type = byType ? getName(node) : Selector.UNIVERSAL_TAG;
    final Node parent = node.parent();
    final List<? extends Node> siblings =
            (parent != null) ? getChildNodes(parent, type) : Collections.<Node>emptyList();
    return new Index(siblings.indexOf(node), siblings.size());
}
Example 2
| Project: serverside-elements-master File: RootImpl.java View source code |
/**
 * Builds a JSON command array and appends it to the pending command queue.
 *
 * <p>Slot 0 holds the command name. When a target is given, slot 1 holds its numeric id
 * taken from {@code nodeToId}. The parameters are appended after whatever has been set
 * so far. Finally the owner is marked dirty.
 *
 * @param name   the command name
 * @param target the node the command applies to, or {@code null} for none; when present
 *               it must belong to this root (checked by assertion)
 * @param params additional JSON values appended to the command
 */
private void addCommand(String name, Node target, JsonValue... params) {
    assert target == null || target.getRoot() == this;

    JsonArray command = Json.createArray();
    command.set(0, name);
    if (target != null) {
        command.set(1, nodeToId.get(target).doubleValue());
    }
    for (JsonValue param : params) {
        command.set(command.length(), param);
    }

    pendingCommands.set(pendingCommands.length(), command);
    owner.markAsDirty();
}
Example 3
| Project: metricminer2-master File: HtmlNodeVisitor.java View source code |
/**
 * Dispatches {@code node} to the visitor method found for its class.
 *
 * <p>Nothing happens when no method is found or when the method found does not return
 * {@code void}. Any failure during the reflective call is rethrown wrapped in a
 * {@link RuntimeException}.
 *
 * @param node the node to dispatch
 */
private void visitNodeByClass(Node node) {
    final Method handler = findVisitorMethodForNodeClass(node);
    if (handler == null || !handler.getReturnType().equals(void.class)) {
        return;
    }
    try {
        handler.invoke(this, node);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Example 4
| Project: moulder-j-master File: SubMoulder.java View source code |
public List<Node> process(Element element) { final Document doc = new Document(element.baseUri()); final Element copy = JsoupHelper.copy(element); doc.appendChild(copy); for (TemplatorConfig c : registry.getConfig()) { Elements elements = copy.select(c.selector); for (Element e : elements) { Collection<Node> oes = MouldersApplier.applyMoulders(c.templators, Arrays.<Node>asList(e)); // replace e with oes for (Node oe : oes) { e.before(oe.outerHtml()); } e.remove(); } } return doc.childNodes(); }
Example 5
| Project: CN1ML-NetbeansModule-master File: ElementsTest.java View source code |
/**
 * Traverses every selected {@code div} and records an opening marker on entry and a
 * closing marker on exit of each node, then compares the recorded sequence.
 */
@Test
public void traverse() {
    final Document doc = Jsoup.parse("<div><p>Hello</p></div><div>There</div>");
    final StringBuilder accum = new StringBuilder();
    final NodeVisitor recorder = new NodeVisitor() {
        @Override
        public void head(Node node, int depth) {
            accum.append("<").append(node.nodeName()).append(">");
        }

        @Override
        public void tail(Node node, int depth) {
            accum.append("</").append(node.nodeName()).append(">");
        }
    };

    doc.select("div").traverse(recorder);

    assertEquals("<div><p><#text></#text></p></div><div><#text></#text></div>", accum.toString());
}
Example 6
| Project: Crud2Go-master File: LoadingIndicatorBootstrapListener.java View source code |
/**
 * Adds a loading indicator to each fragment node recognised as the main div.
 *
 * <p>Does nothing when the message obtained for the response is {@code null} or empty.
 *
 * @param response the bootstrap fragment response whose nodes are inspected
 */
@Override
public void modifyBootstrapFragment(BootstrapFragmentResponse response) {
    final String message = getMessage(response);
    if (Strings.isNullOrEmpty(message)) {
        return;
    }
    for (Node candidate : response.getFragmentNodes()) {
        if (isMainDiv(candidate)) {
            addLoadingIndicator((Element) candidate, message);
        }
    }
}
Example 7
| Project: FudanBBS-master File: Parser.java View source code |
/**
 * Parses an HTML fragment and places the result inside the {@code body} of a new document.
 *
 * @param bodyHtml the HTML fragment to parse
 * @param baseUri  base URI of the document (i.e. original fetch location), used to resolve
 *                 relative URLs
 * @return a document shell whose body contains the parsed fragment
 */
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
    final Document shell = Document.createShell(baseUri);
    final Element body = shell.body();
    final List<Node> parsed = parseFragment(bodyHtml, body, baseUri);

    // Work on a snapshot: re-parenting a node modifies the parsed list.
    final Node[] snapshot = parsed.toArray(new Node[parsed.size()]);
    for (int i = 0; i < snapshot.length; i++) {
        body.appendChild(snapshot[i]);
    }
    return shell;
}
Example 8
| Project: jsoup-master File: ElementsTest.java View source code |
/**
 * Checks that traversing the selected {@code div} elements calls {@code head} and
 * {@code tail} for every node in document order, text nodes included.
 */
@Test
public void traverse() {
    final Document doc = Jsoup.parse("<div><p>Hello</p></div><div>There</div>");
    final StringBuilder accum = new StringBuilder();
    final NodeVisitor tagLogger = new NodeVisitor() {
        @Override
        public void head(Node node, int depth) {
            accum.append("<").append(node.nodeName()).append(">");
        }

        @Override
        public void tail(Node node, int depth) {
            accum.append("</").append(node.nodeName()).append(">");
        }
    };

    doc.select("div").traverse(tagLogger);

    assertEquals("<div><p><#text></#text></p></div><div><#text></#text></div>", accum.toString());
}
Example 9
| Project: validadorAcessibilidade-master File: Parser.java View source code |
/**
 * Parses an HTML fragment and places the result inside the {@code body} of a new document.
 *
 * @param bodyHtml the HTML fragment to parse
 * @param baseUri  base URI of the document (i.e. original fetch location), used to resolve
 *                 relative URLs
 * @return a document shell whose body contains the parsed fragment
 */
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
    final Document shell = Document.createShell(baseUri);
    final Element body = shell.body();
    final List<Node> parsed = parseFragment(bodyHtml, body, baseUri);

    // Work on a snapshot: re-parenting a node modifies the parsed list.
    final Node[] snapshot = parsed.toArray(new Node[parsed.size()]);
    for (int i = 0; i < snapshot.length; i++) {
        body.appendChild(snapshot[i]);
    }
    return shell;
}
Example 10
| Project: zafu_jwc-master File: Parser.java View source code |
/**
 * Parses an HTML fragment and places the result inside the {@code body} of a new document.
 *
 * @param bodyHtml the HTML fragment to parse
 * @param baseUri  base URI of the document (i.e. original fetch location), used to resolve
 *                 relative URLs
 * @return a document shell whose body contains the parsed fragment
 */
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
    final Document shell = Document.createShell(baseUri);
    final Element body = shell.body();
    final List<Node> parsed = parseFragment(bodyHtml, body, baseUri);

    // Work on a snapshot: re-parenting a node modifies the parsed list.
    final Node[] snapshot = parsed.toArray(new Node[parsed.size()]);
    for (int i = 0; i < snapshot.length; i++) {
        body.appendChild(snapshot[i]);
    }
    return shell;
}
Example 11
| Project: Vega-master File: NodeImpl.java View source code |
/**
 * Wraps a jsoup node in the matching implementation class.
 *
 * <p>Elements, text nodes, comments and data nodes each get a dedicated wrapper; any other
 * node type is wrapped in a plain {@code NodeImpl}.
 *
 * @param node          the jsoup node to wrap, may be {@code null}
 * @param ownerDocument the document passed on to the created wrapper
 * @return the wrapper, or {@code null} when {@code node} is {@code null}
 */
static NodeImpl createFromJsoupNode(org.jsoup.nodes.Node node, Document ownerDocument) {
    if (node == null) {
        return null;
    }
    if (node instanceof org.jsoup.nodes.Element) {
        return HTMLElementImpl.create((Element) node, ownerDocument);
    }
    if (node instanceof org.jsoup.nodes.TextNode) {
        return new TextImpl((org.jsoup.nodes.TextNode) node, ownerDocument);
    }
    if (node instanceof org.jsoup.nodes.Comment) {
        return new CommentImpl((org.jsoup.nodes.Comment) node, ownerDocument);
    }
    if (node instanceof org.jsoup.nodes.DataNode) {
        return new CharacterDataImpl((DataNode) node, ((DataNode) node).getWholeData(), ownerDocument);
    }
    return new NodeImpl(node, ownerDocument);
}
Example 12
| Project: alfresco-apache-storm-demo-master File: JSoupDOMBuilder.java View source code |
/**
 * Recursively copies the content of a jsoup node into a W3C {@link Node}.
 *
 * <p>A jsoup document contributes only its children. An element is recreated with its
 * attributes and its children are copied beneath it. A text node is copied unless the
 * receiving node is the W3C document itself. Other node types are ignored.
 *
 * @param node the jsoup node whose content is copied
 * @param out  the W3C {@link Node} that receives the copied content
 * @param doc  the W3C document used to create elements and text nodes
 * @param ns   prefix-to-value map filled from {@code xmlns:*} attributes seen so far
 */
public static void createDOM(org.jsoup.nodes.Node node, Node out, Document doc, Map<String, String> ns) {
    if (node instanceof org.jsoup.nodes.Document) {
        for (org.jsoup.nodes.Node child : ((org.jsoup.nodes.Document) node).childNodes()) {
            createDOM(child, out, doc, ns);
        }
        return;
    }

    if (node instanceof org.jsoup.nodes.Element) {
        org.jsoup.nodes.Element source = (org.jsoup.nodes.Element) node;
        org.w3c.dom.Element target = doc.createElement(source.tagName());
        out.appendChild(target);

        for (org.jsoup.nodes.Attribute attribute : source.attributes()) {
            String name = attribute.getKey();
            if (name.equals("xmlns")) {
                // omit xhtml namespace
                continue;
            }
            String prefix = getNSPrefix(name);
            if (prefix != null) {
                if (prefix.equals("xmlns")) {
                    ns.put(getLocalName(name), attribute.getValue());
                } else if (!prefix.equals("xml") && ns.get(prefix) == null) {
                    // fix attribute names looking like qnames
                    name = name.replace(':', '_');
                }
            }
            target.setAttribute(name, attribute.getValue());
        }

        for (org.jsoup.nodes.Node child : source.childNodes()) {
            createDOM(child, target, doc, ns);
        }
        return;
    }

    if (node instanceof org.jsoup.nodes.TextNode && !(out instanceof Document)) {
        org.jsoup.nodes.TextNode text = (org.jsoup.nodes.TextNode) node;
        out.appendChild(doc.createTextNode(text.text()));
    }
}
Example 13
| Project: stanbol-master File: DOMBuilder.java View source code |
/** * The internal helper that copies content from the specified Jsoup <tt>Node</tt> into a W3C {@link Node}. * @param node The Jsoup node containing the content to copy to the specified W3C {@link Node}. * @param out The W3C {@link Node} that receives the DOM content. */ private static void createDOM(org.jsoup.nodes.Node node, Node out, Document doc, Map<String, String> ns) { if (node instanceof org.jsoup.nodes.Document) { org.jsoup.nodes.Document d = ((org.jsoup.nodes.Document) node); for (org.jsoup.nodes.Node n : d.childNodes()) { createDOM(n, out, doc, ns); } } else if (node instanceof org.jsoup.nodes.Element) { org.jsoup.nodes.Element e = ((org.jsoup.nodes.Element) node); org.w3c.dom.Element _e = doc.createElement(e.tagName()); out.appendChild(_e); org.jsoup.nodes.Attributes atts = e.attributes(); for (org.jsoup.nodes.Attribute a : atts) { String attName = a.getKey(); //omit xhtml namespace if (attName.equals("xmlns")) { continue; } String attPrefix = getNSPrefix(attName); if (attPrefix != null) { if (attPrefix.equals("xmlns")) { ns.put(getLocalName(attName), a.getValue()); } else if (!attPrefix.equals("xml")) { String namespace = ns.get(attPrefix); if (namespace == null) { //fix attribute names looking like qnames attName = attName.replace(':', '_'); } } } _e.setAttribute(attName, a.getValue()); } for (org.jsoup.nodes.Node n : e.childNodes()) { createDOM(n, _e, doc, ns); } } else if (node instanceof org.jsoup.nodes.TextNode) { org.jsoup.nodes.TextNode t = ((org.jsoup.nodes.TextNode) node); if (!(out instanceof Document)) { out.appendChild(doc.createTextNode(t.text())); } } }
Example 14
| Project: tori-master File: DOMBuilder.java View source code |
/**
 * Recursively copies the content of a jsoup node into a W3C {@link Node}.
 *
 * <p>A jsoup document contributes only its children. An element is recreated with its
 * attributes and its children are copied beneath it. A text node is copied unless the
 * receiving node is the W3C document itself. Other node types are ignored.
 *
 * @param node
 *            the jsoup node whose content is copied
 * @param out
 *            the W3C {@link Node} that receives the copied content
 * @param doc
 *            the W3C document used to create elements and text nodes
 * @param ns
 *            prefix-to-value map filled from {@code xmlns:*} attributes seen so far
 */
private static void createDOM(final org.jsoup.nodes.Node node, final Node out, final Document doc, final Map<String, String> ns) {
    if (node instanceof org.jsoup.nodes.Document) {
        for (org.jsoup.nodes.Node child : ((org.jsoup.nodes.Document) node).childNodes()) {
            createDOM(child, out, doc, ns);
        }
        return;
    }

    if (node instanceof org.jsoup.nodes.Element) {
        org.jsoup.nodes.Element source = (org.jsoup.nodes.Element) node;
        org.w3c.dom.Element target = doc.createElement(source.tagName());
        out.appendChild(target);

        for (org.jsoup.nodes.Attribute attribute : source.attributes()) {
            String name = attribute.getKey();
            if (name.equals("xmlns")) {
                // omit xhtml namespace
                continue;
            }
            String prefix = getNSPrefix(name);
            if (prefix != null) {
                if (prefix.equals("xmlns")) {
                    ns.put(getLocalName(name), attribute.getValue());
                } else if (!prefix.equals("xml") && ns.get(prefix) == null) {
                    // fix attribute names looking like qnames
                    name = name.replace(':', '_');
                }
            }
            target.setAttribute(name, attribute.getValue());
        }

        for (org.jsoup.nodes.Node child : source.childNodes()) {
            createDOM(child, target, doc, ns);
        }
        return;
    }

    if (node instanceof org.jsoup.nodes.TextNode && !(out instanceof Document)) {
        org.jsoup.nodes.TextNode text = (org.jsoup.nodes.TextNode) node;
        out.appendChild(doc.createTextNode(text.text()));
    }
}
Example 15
| Project: sisob-academic-data-extractor-master File: cUtils.java View source code |
/**
 * Returns the longest common prefix (or suffix) of the string forms of two nodes.
 *
 * <p>Comparison stops at the first differing character or when the shorter of the two
 * strings is exhausted, so inputs of different lengths are handled without an
 * out-of-range access.
 *
 * @param bReverse {@code false} to compare from the start (common prefix), {@code true}
 *                 to compare from the end (common suffix)
 * @param lst1     first node; compared via {@code toString()}
 * @param lst2     second node; compared via {@code toString()}
 * @return the shared leading or trailing text, empty when the first compared characters differ
 */
public static String lookPatternByText(boolean bReverse, org.jsoup.nodes.Node lst1, org.jsoup.nodes.Node lst2) {
    final String s1 = lst1.toString();
    final String s2 = lst2.toString();

    // Bound by the shorter string: the original only checked s1 and overran s2.
    final int limit = Math.min(s1.length(), s2.length());
    int matched = 0;
    while (matched < limit) {
        final int i1 = bReverse ? s1.length() - 1 - matched : matched;
        final int i2 = bReverse ? s2.length() - 1 - matched : matched;
        if (s1.charAt(i1) != s2.charAt(i2)) {
            break;
        }
        matched++;
    }

    return bReverse ? s1.substring(s1.length() - matched) : s1.substring(0, matched);
}
Example 16
| Project: jresponder-master File: TextUtil.java View source code |
/* ====================================================================== */
/**
 * Returns the text of a cell, preferring the whole text of its first child node.
 *
 * @param cell element that contains whitespace formatting
 * @return the first child's {@code getWholeText()} when that child is a {@link TextNode}
 *         and the value is non-null; otherwise {@code cell.text()}
 */
public String getWholeText(Element cell) {
    final List<Node> children = cell.childNodes();
    if (!children.isEmpty() && children.get(0) instanceof TextNode) {
        final String whole = ((TextNode) children.get(0)).getWholeText();
        if (whole != null) {
            return whole;
        }
    }
    return cell.text();
}
Example 17
| Project: mbox_tools-master File: HTMLStripUtil.java View source code |
/**
 * Appends the trimmed text of each non-blank text node to {@code buffer}, followed by a
 * single separating space. Nodes that are not text nodes are ignored.
 *
 * @param node  the node being entered
 * @param depth the traversal depth (unused)
 */
@Override
public void head(Node node, int depth) {
    if (!(node instanceof TextNode)) {
        return;
    }
    // Turn non-breaking spaces into plain spaces so trim() can strip them. The character
    // is written as an escape because a literal NBSP is indistinguishable from a space.
    final String text = ((TextNode) node).text().replace('\u00A0', ' ').trim();
    if (!text.isEmpty()) {
        // After trim() the text never ends with a space, so the separator is always needed.
        // The last text gets the extra space too; per the original comment it is removed later.
        buffer.append(text).append(" ");
    }
}
Example 18
| Project: mylyn.docs-master File: WhitespaceCleanupProcessor.java View source code |
/**
 * Moves leading and trailing whitespace text out of every element below {@code body},
 * except elements for which {@code Html.isWhitespacePreserve} is true.
 *
 * <p>For each element the first child is inspected for leading whitespace and the last
 * child for trailing whitespace. The whitespace part is detached and re-inserted next to
 * the node returned by {@code computeBeforeTarget} / {@code computeAfterTarget}. Every
 * parent touched along the way is collected and its text nodes are normalized again
 * before the next element is processed.
 *
 * @param body the element whose descendants (and itself, via {@code getAllElements()}) are processed
 */
private void moveLeadingOrTrailingSpaceOutOfElements(Element body) {
// parents whose children changed while handling the current element
Set<Node> affectedParents = new HashSet<Node>();
for (Element element : body.getAllElements()) {
if (!Html.isWhitespacePreserve(element)) {
normalizeTextNodes(element);
List<Node> children = element.childNodes();
if (!children.isEmpty()) {
// --- leading whitespace: look at the first child ---
Node firstChild = children.get(0);
if (firstChild instanceof TextNode) {
TextNode textNode = (TextNode) firstChild;
String text = textNode.getWholeText();
int nonWhitespaceIndex = firstIndexOfNonWhitespace(text);
if (nonWhitespaceIndex > 0) {
// text starts with whitespace followed by other content
affectedParents.add(textNode.parent());
// split
// NOTE(review): relies on textNode keeping the leading part after the split — confirm against jsoup TextNode.splitText
textNode.splitText(nonWhitespaceIndex);
// move outside
textNode.remove();
computeBeforeTarget(element).before(textNode);
affectedParents.add(textNode.parent());
} else if (nonWhitespaceIndex == -1) {
// presumably -1 means the text has no non-whitespace character at all — verify in firstIndexOfNonWhitespace
// move outside
textNode.remove();
computeAfterTarget(element).after(textNode);
affectedParents.add(textNode.parent());
}
}
normalizeEmptySpaceBetweenNodes(element);
// re-read the children: the steps above may have changed them
children = element.childNodes();
if (!children.isEmpty()) {
// --- trailing whitespace: look at the last child ---
Node lastChild = children.get(children.size() - 1);
if (lastChild instanceof TextNode) {
TextNode textNode = (TextNode) lastChild;
String text = textNode.getWholeText();
int lastNonWhitespaceIndex = lastIndexOfNonWhitespace(text);
if (lastNonWhitespaceIndex < 0) {
// move outside
textNode.remove();
computeAfterTarget(element).after(textNode);
affectedParents.add(textNode.parent());
} else if (lastNonWhitespaceIndex < (text.length() - 1)) {
// text ends with whitespace after other content
affectedParents.add(textNode.parent());
// split
textNode.splitText(lastNonWhitespaceIndex + 1);
// move outside
// the part after the split point is picked up as the next sibling
textNode = (TextNode) textNode.nextSibling();
textNode.remove();
computeAfterTarget(element).after(textNode);
affectedParents.add(textNode.parent());
}
}
}
}
// re-normalize every parent touched for this element, then reset for the next one
if (!affectedParents.isEmpty()) {
for (Node parent : affectedParents) {
if (parent instanceof Element) {
normalizeTextNodes((Element) parent);
}
}
affectedParents.clear();
}
}
}
}Example 19
| Project: org.eclipse.mylyn.docs-master File: WhitespaceCleanupProcessor.java View source code |
/**
 * Moves leading and trailing whitespace text out of every element below {@code body},
 * except elements for which {@code Html.isWhitespacePreserve} is true.
 *
 * <p>For each element the first child is inspected for leading whitespace and the last
 * child for trailing whitespace. The whitespace part is detached and re-inserted next to
 * the node returned by {@code computeBeforeTarget} / {@code computeAfterTarget}. Every
 * parent touched along the way is collected and its text nodes are normalized again
 * before the next element is processed.
 *
 * @param body the element whose descendants (and itself, via {@code getAllElements()}) are processed
 */
private void moveLeadingOrTrailingSpaceOutOfElements(Element body) {
// parents whose children changed while handling the current element
Set<Node> affectedParents = new HashSet<Node>();
for (Element element : body.getAllElements()) {
if (!Html.isWhitespacePreserve(element)) {
normalizeTextNodes(element);
List<Node> children = element.childNodes();
if (!children.isEmpty()) {
// --- leading whitespace: look at the first child ---
Node firstChild = children.get(0);
if (firstChild instanceof TextNode) {
TextNode textNode = (TextNode) firstChild;
String text = textNode.getWholeText();
int nonWhitespaceIndex = firstIndexOfNonWhitespace(text);
if (nonWhitespaceIndex > 0) {
// text starts with whitespace followed by other content
affectedParents.add(textNode.parent());
// split
// NOTE(review): relies on textNode keeping the leading part after the split — confirm against jsoup TextNode.splitText
textNode.splitText(nonWhitespaceIndex);
// move outside
textNode.remove();
computeBeforeTarget(element).before(textNode);
affectedParents.add(textNode.parent());
} else if (nonWhitespaceIndex == -1) {
// presumably -1 means the text has no non-whitespace character at all — verify in firstIndexOfNonWhitespace
// move outside
textNode.remove();
computeAfterTarget(element).after(textNode);
affectedParents.add(textNode.parent());
}
}
normalizeEmptySpaceBetweenNodes(element);
// re-read the children: the steps above may have changed them
children = element.childNodes();
if (!children.isEmpty()) {
// --- trailing whitespace: look at the last child ---
Node lastChild = children.get(children.size() - 1);
if (lastChild instanceof TextNode) {
TextNode textNode = (TextNode) lastChild;
String text = textNode.getWholeText();
int lastNonWhitespaceIndex = lastIndexOfNonWhitespace(text);
if (lastNonWhitespaceIndex < 0) {
// move outside
textNode.remove();
computeAfterTarget(element).after(textNode);
affectedParents.add(textNode.parent());
} else if (lastNonWhitespaceIndex < (text.length() - 1)) {
// text ends with whitespace after other content
affectedParents.add(textNode.parent());
// split
textNode.splitText(lastNonWhitespaceIndex + 1);
// move outside
// the part after the split point is picked up as the next sibling
textNode = (TextNode) textNode.nextSibling();
textNode.remove();
computeAfterTarget(element).after(textNode);
affectedParents.add(textNode.parent());
}
}
}
}
// re-normalize every parent touched for this element, then reset for the next one
if (!affectedParents.isEmpty()) {
for (Node parent : affectedParents) {
if (parent instanceof Element) {
normalizeTextNodes((Element) parent);
}
}
affectedParents.clear();
}
}
}
}Example 20
| Project: SMSnatcher-master File: ParseUtils.java View source code |
// Lovingly borrowed from https://gist.github.com/491407 public static void removeComments(Node node) { // as we are removing child nodes while iterating, we cannot use a normal foreach over children, // or will get a concurrent list modification error. int i = 0; while (i < node.childNodes().size()) { Node child = node.childNode(i); if (child.nodeName().equals("#comment")) child.remove(); else { removeComments(child); i++; } } }
Example 21
| Project: WebCollector-master File: ContentExtractor.java View source code |
/**
 * Computes text and tag statistics for {@code node} and, recursively, its children.
 *
 * <p>For an element the children's counts are summed, the element itself is counted as a
 * tag, {@code a} and {@code p} tags get special treatment, the density is derived and the
 * result is stored in {@code infoMap}. For a text node only the text length is recorded.
 * Any other node yields an empty {@code CountInfo}.
 *
 * @param node the node to analyse
 * @return the statistics for the node
 */
protected CountInfo computeInfo(Node node) {
    if (node instanceof Element) {
        final Element tag = (Element) node;
        final CountInfo total = new CountInfo();

        for (Node child : tag.childNodes()) {
            final CountInfo part = computeInfo(child);
            total.textCount += part.textCount;
            total.linkTextCount += part.linkTextCount;
            total.tagCount += part.tagCount;
            total.linkTagCount += part.linkTagCount;
            total.leafList.addAll(part.leafList);
            total.densitySum += part.density;
            total.pCount += part.pCount;
        }

        total.tagCount++;
        final String tagName = tag.tagName();
        if (tagName.equals("a")) {
            // all text inside a link counts as link text
            total.linkTextCount = total.textCount;
            total.linkTagCount++;
        } else if (tagName.equals("p")) {
            total.pCount++;
        }

        final int pureLen = total.textCount - total.linkTextCount;
        final int len = total.tagCount - total.linkTagCount;
        if (pureLen != 0 && len != 0) {
            total.density = (pureLen + 0.0) / len;
        } else {
            total.density = 0;
        }

        infoMap.put(tag, total);
        return total;
    }

    if (node instanceof TextNode) {
        final CountInfo leaf = new CountInfo();
        final int length = ((TextNode) node).text().length();
        leaf.textCount = length;
        leaf.leafList.add(length);
        return leaf;
    }

    return new CountInfo();
}
Example 22
| Project: XCoLab-master File: EmailNotification.java View source code |
/**
 * Resolves a placeholder tag to the node that should replace it.
 *
 * <p>The superclass is asked first; its non-null answer wins. Otherwise the tag name is
 * matched against the known placeholders. Placeholders that need a contest, a contest
 * type or a proposal resolve only when that data is available.
 *
 * @param tag the placeholder element
 * @return the replacement node, or {@code null} when the tag is unknown or the data it
 *         needs is missing
 */
@Override
protected Node resolvePlaceholderTag(Element tag) {
    final Node inherited = super.resolvePlaceholderTag(tag);
    if (inherited != null) {
        return inherited;
    }

    Contest contest = getContest();
    Proposal proposal = getProposal();
    final boolean hasProposal = contest != null && proposal != null;
    final ContestType contestType = contest != null
            ? ContestClientUtil.getContestType(contest.getContestTypeId())
            : null;
    final boolean hasContestType = contest != null && contestType != null;

    switch (tag.nodeName()) {
        case COLAB_NAME_PLACEHOLDER:
            return new TextNode(ConfigurationAttributeKey.COLAB_NAME.get(), "");
        case COLAB_URL_PLACEHOLDER:
            return new TextNode(ConfigurationAttributeKey.COLAB_URL.get(), "");
        case COLAB_ADMIN_EMAIL_PLACEHOLDER:
            return new TextNode(ConfigurationAttributeKey.ADMIN_EMAIL.get(), "");
        case FIRSTNAME_PLACEHOLDER:
            return new TextNode(getRecipient().getFirstName(), "");
        case FULL_NAME_PLACEHOLDER:
            return new TextNode(getRecipient().getFullName(), "");
        case CONTEST_LINK_PLACEHOLDER:
            if (contest != null) {
                return parseXmlNode(getContestLink(contest));
            }
            break;
        case PROPOSAL_LINK_PLACEHOLDER:
            if (hasProposal) {
                final String tab = tag.hasAttr("tab") ? tag.attr("tab") : null;
                // the tag's own text wins; otherwise fall back to the proposal name attribute
                final String linkText = StringUtils.isNotBlank(tag.ownText())
                        ? tag.ownText()
                        : getProposalAttributeHelper().getAttributeValueString(ProposalAttributeKeys.NAME, "");
                return parseXmlNode(getProposalLinkWithLinkText(contest, proposal, linkText, tab));
            }
            break;
        case PROPOSAL_STRING_PLACEHOLDER:
            if (hasContestType) {
                return new TextNode(contestType.getProposalName(), "");
            }
            break;
        case PROPOSALS_STRING_PLACEHOLDER:
            if (hasContestType) {
                return new TextNode(contestType.getProposalNamePlural(), "");
            }
            break;
        case CONTEST_STRING_PLACEHOLDER:
            if (hasContestType) {
                return new TextNode(contestType.getContestName(), "");
            }
            break;
        case CONTESTS_STRING_PLACEHOLDER:
            if (hasContestType) {
                return new TextNode(contestType.getContestNamePlural(), "");
            }
            break;
        case TWITTER_PLACEHOLDER:
            if (hasProposal) {
                return parseXmlNode(getTwitterShareLink(getProposalLinkUrl(contest, proposal), tag.ownText()));
            }
            break;
        case PINTEREST_PLACEHOLDER:
            if (hasProposal) {
                return parseXmlNode(getPinterestShareLink(getProposalLinkUrl(contest, proposal), tag.ownText()));
            }
            break;
        case FACEBOOK_PLACEHOLDER:
            if (hasProposal) {
                return parseXmlNode(getFacebookShareLink(getProposalLinkUrl(contest, proposal)));
            }
            break;
        case LINKEDIN_PLACEHOLDER:
            if (hasProposal) {
                return parseXmlNode(getLinkedInShareLink(getProposalLinkUrl(contest, proposal), tag.attr("title"), tag.ownText()));
            }
            break;
        default:
    }
    return null;
}
Example 23
| Project: iee-master File: TextPadParser.java View source code |
/**
 * Converts the jsoup node being entered into an {@code INode} and pushes it on the stack.
 *
 * <p>Text nodes become {@code Text}; elements become a {@code Span} whose style is filled
 * from the inline {@code style} attribute when present; anything else becomes an empty
 * {@code Span}.
 *
 * @param node  the node being entered
 * @param depth the traversal depth (unused)
 */
@Override
public void head(org.jsoup.nodes.Node node, int depth) {
    INode newNode;
    if (node instanceof org.jsoup.nodes.TextNode) {
        newNode = new Text().setText(((org.jsoup.nodes.TextNode) node).text());
    } else if (node instanceof org.jsoup.nodes.Element) {
        Span span = new Span();
        if (node.hasAttr("style")) {
            TextStyle style = span.getStyle();
            try {
                CSSStyleDeclaration styleDecl = parser.parseStyleDeclaration(new InputSource(new StringReader(node.attr("style"))));
                style.setItalic("italic".equals(styleDecl.getPropertyValue("font-style")));
                style.setBold("bold".equals(styleDecl.getPropertyValue("font-weight")));
                if (styleDecl.getPropertyValue("font-family") != null) {
                    style.setFont(styleDecl.getPropertyValue("font-family"));
                }
                if (!Strings.isNullOrEmpty(styleDecl.getPropertyValue("font-size"))) {
                    // Integer.valueOf replaces the deprecated Integer(String) constructor;
                    // a non-numeric size still raises NumberFormatException as before.
                    style.setFontSize(Integer.valueOf(styleDecl.getPropertyValue("font-size")));
                }
                if (!Strings.isNullOrEmpty(styleDecl.getPropertyValue("color"))) {
                    style.setFgColor(readRgbColor(styleDecl, "color"));
                }
                if (!Strings.isNullOrEmpty(styleDecl.getPropertyValue("background-color"))) {
                    style.setBgColor(readRgbColor(styleDecl, "background-color"));
                }
            } catch (IOException e) {
                // Kept as in the original: a style that cannot be read leaves the span with
                // whatever was applied so far. NOTE(review): route through the project logger.
                e.printStackTrace();
            }
        }
        newNode = span;
    } else {
        newNode = new Span();
    }
    stack.push(newNode);
}

/** Reads an RGB colour property from a style declaration, truncating each channel to an int. */
private static Color readRgbColor(CSSStyleDeclaration styleDecl, String property) {
    RGBColor rgb = ((CSSPrimitiveValue) styleDecl.getPropertyCSSValue(property)).getRGBColorValue();
    return new Color(
            (int) rgb.getRed().getFloatValue(CSSPrimitiveValue.CSS_NUMBER),
            (int) rgb.getGreen().getFloatValue(CSSPrimitiveValue.CSS_NUMBER),
            (int) rgb.getBlue().getFloatValue(CSSPrimitiveValue.CSS_NUMBER));
}
Example 24
| Project: blade.tools-master File: MarkdownParser.java View source code |
/**
 * Maps each section link in the HTML to the markup that follows it.
 *
 * <p>For every {@code h3} that is a direct child of an {@code a[href]}, the anchor's
 * sibling nodes are read starting at the anchor's sibling index and concatenated until a
 * sibling whose markup starts with {@code <hr} is reached.
 *
 * @param html the HTML to parse
 * @return a map from each anchor's {@code href} to the concatenated sibling markup
 */
private static Map<String, String> parseHtml(String html) {
    final Map<String, String> sections = new HashMap<>();
    final Document document = Jsoup.parse(html);

    for (Element heading : document.select("a[href] > h3")) {
        final Element anchor = heading.parent();
        final int start = anchor.siblingIndex();
        final List<Node> siblings = anchor.siblingNodes();

        final StringBuilder content = new StringBuilder();
        for (int i = start; i < siblings.size(); i++) {
            final String markup = siblings.get(i).toString();
            if (markup.startsWith("<hr")) {
                break;
            }
            content.append(markup);
        }

        sections.put(anchor.attr("href"), content.toString());
    }
    return sections;
}
Example 25
| Project: java-autolinker-master File: UrlAutoLinkerTest.java View source code |
/**
 * Exercises {@code UrlAutoLinker.createLinks} with plain text, URLs with and without a
 * protocol, URLs at the start and end of the text, an over-long URL whose label is
 * shortened, and text surrounded by whitespace.
 */
@Test
public void createLinksShouldWork() {
    final UrlAutoLinker autoLinker = new UrlAutoLinker(30);
    List<Node> result;

    // no URL at all
    result = autoLinker.createLinks(new TextNode("das ist ein test ohne urls", ""));
    Assert.assertEquals(1, result.size());
    assertTextNode("das ist ein test ohne urls", result.get(0));

    // URL without protocol in the middle
    result = autoLinker.createLinks(new TextNode("das ist eine url ohne twitter.com ohne protocoll", ""));
    Assert.assertEquals(3, result.size());
    assertTextNode("das ist eine url ohne ", result.get(0));
    assertLink("http://twitter.com", "twitter.com", result.get(1));
    assertTextNode(" ohne protocoll", result.get(2));

    // URL without protocol at the start
    result = autoLinker.createLinks(new TextNode("twitter.com ohne protocoll am anfang", ""));
    Assert.assertEquals(2, result.size());
    assertLink("http://twitter.com", "twitter.com", result.get(0));
    assertTextNode(" ohne protocoll am anfang", result.get(1));

    // URL at the start and URL with protocol at the end
    result = autoLinker.createLinks(new TextNode("twitter.com ohne protocoll am anfang mit am ende https://dailyfratze.de?foo=bar", ""));
    Assert.assertEquals(3, result.size());
    assertLink("http://twitter.com", "twitter.com", result.get(0));
    assertTextNode(" ohne protocoll am anfang mit am ende ", result.get(1));
    assertLink("https://dailyfratze.de?foo=bar", "dailyfratze.de?foo=bar", result.get(2));

    // long URL: label is shortened
    result = autoLinker.createLinks(new TextNode("das ist eine url ohne https://dailyfratze.de/app/tags/CoStarring/Anton#taggedPictures ohne protocoll", ""));
    assertLink("https://dailyfratze.de/app/tags/CoStarring/Anton#taggedPictures", "dailyfratze.de/app/tags/CoSta…", result.get(1));

    // surrounding whitespace is kept as separate text nodes
    result = autoLinker.createLinks(new TextNode(" twitter.com ohne protocoll am anfang mit am ende https://dailyfratze.de?foo=bar ", ""));
    Assert.assertEquals(5, result.size());
    assertTextNode(" ", result.get(0));
    assertLink("http://twitter.com", "twitter.com", result.get(1));
    assertTextNode(" ohne protocoll am anfang mit am ende ", result.get(2));
    assertLink("https://dailyfratze.de?foo=bar", "dailyfratze.de?foo=bar", result.get(3));
    assertTextNode(" ", result.get(4));
}

/** Asserts that {@code actual} is a text node with exactly the expected whole text. */
private static void assertTextNode(String expectedText, Node actual) {
    Assert.assertTrue(actual instanceof TextNode);
    Assert.assertEquals(expectedText, ((TextNode) actual).getWholeText());
}

/** Asserts that {@code actual} is an {@code a} element whose href and title are the URL and whose first child is the label. */
private static void assertLink(String expectedUrl, String expectedLabel, Node actual) {
    Assert.assertTrue(actual instanceof Element);
    final Element a = (Element) actual;
    Assert.assertEquals("a", a.tagName());
    Assert.assertEquals(expectedUrl, a.attr("href"));
    Assert.assertEquals(expectedUrl, a.attr("title"));
    Assert.assertEquals(expectedLabel, ((TextNode) a.childNode(0)).getWholeText());
}
Example 26
| Project: nate-master File: JsoupBackedNateElement.java View source code |
/**
 * Replaces all children of this element with clones of the nodes in {@code newChildrenSource}.
 *
 * <p>The argument is validated before anything is removed, so a rejected argument leaves
 * the current children untouched.
 *
 * @param newChildrenSource the source of the new children; must be a
 *                          {@code JsoupBackedNateDocumentFragment}
 * @throws IllegalStateException if {@code newChildrenSource} is of any other type
 */
@Override
public void replaceChildren(NateDocument newChildrenSource) {
    verifyState();
    if (!(newChildrenSource instanceof JsoupBackedNateDocumentFragment)) {
        throw new IllegalStateException("Internal Error. Expected JsoupBackedNateDocumentFragment, but got: " + newChildrenSource);
    }
    removeChildren();
    Collection<Node> newChildren = ((JsoupBackedAbstractNode) newChildrenSource).getJsoupNodes();
    for (Node node : newChildren) {
        // clone so the source fragment keeps its own nodes
        this.element.appendChild(node.clone());
    }
}
Example 27
| Project: sitebricks-master File: HtmlTemplateCompiler.java View source code |
/**
 * Walks the DOM recursively, and converts elements into corresponding sitebricks widgets.
 *
 * <p>Each child of {@code node} is handled by type: elements are walked recursively and
 * turned into widgets wrapping their own child chain; text nodes become text widgets
 * (wrapped in an annotation widget when the node carries {@code ANNOTATION_KEY}); comments
 * and data nodes become raw text widgets; XML declarations become XML directive widgets.
 * Expression compile failures are recorded in {@code pc.errors} instead of being thrown.
 *
 * @param pc   the compiling context holding the current form, lexical scopes and errors
 * @param node the node whose children are converted
 * @return the widget chain built from the children of {@code node}
 */
@NotNull
private <N extends Node> WidgetChain walk(PageCompilingContext pc, N node) {
WidgetChain widgetChain = Chains.proceeding();
for (Node n : node.childNodes()) {
if (n instanceof Element) {
final Element child = (Element) n;
//push form if this is a form tag
if (child.tagName().equals("form"))
pc.form = (Element) n;
//setup a lexical scope if we're going into a repeat widget (by reading the previous node)
final boolean shouldPopScope = lexicalClimb(pc, child);
//continue recursing down, perform a post-order, depth-first traversal of the DOM
WidgetChain childsChildren;
try {
childsChildren = walk(pc, child);
//process the widget itself into a Renderable with child tree
widgetChain.addWidget(widgetize(pc, child, childsChildren));
} finally {
// always undo the lexical climb, even if walking or widgetizing the child throws
lexicalDescend(pc, child, shouldPopScope);
}
} else if (n instanceof TextNode) {
TextNode child = (TextNode) n;
Renderable textWidget;
//setup a lexical scope if we're going into a repeat widget (by reading the previous node)
final boolean shouldPopScope = lexicalClimb(pc, child);
// construct the text widget
try {
textWidget = registry.textWidget(cleanHtml(n), pc.lexicalScopes.peek());
// if there are no annotations, add the text widget to the chain
if (!child.hasAttr(ANNOTATION_KEY)) {
widgetChain.addWidget(textWidget);
} else {
// construct a new widget chain for this text node
WidgetChain childsChildren = Chains.proceeding().addWidget(textWidget);
// make a new widget for the annotation, making the text chain the child
String widgetName = child.attr(ANNOTATION_KEY).toLowerCase();
Renderable annotationWidget = registry.newWidget(widgetName, child.attr(ANNOTATION_CONTENT), childsChildren, pc.lexicalScopes.peek());
widgetChain.addWidget(annotationWidget);
}
} catch (ExpressionCompileException e) {
// NOTE(review): this branch reports line(n) while the branches below report line(node) — confirm which is intended
pc.errors.add(CompileError.in(node.outerHtml()).near(line(n)).causedBy(e));
}
// unlike the element branch, the scope is popped here directly rather than in a finally block
if (shouldPopScope)
pc.lexicalScopes.pop();
} else if ((n instanceof Comment) || (n instanceof DataNode)) {
//process as raw text widget
try {
widgetChain.addWidget(registry.textWidget(cleanHtml(n), pc.lexicalScopes.peek()));
} catch (ExpressionCompileException e) {
pc.errors.add(CompileError.in(node.outerHtml()).near(line(node)).causedBy(e));
}
} else if (n instanceof XmlDeclaration) {
try {
widgetChain.addWidget(registry.xmlDirectiveWidget(((XmlDeclaration) n).getWholeDeclaration(), pc.lexicalScopes.peek()));
} catch (ExpressionCompileException e) {
pc.errors.add(CompileError.in(node.outerHtml()).near(line(node)).causedBy(e));
}
}
}
//return computed chain, or a terminal
return widgetChain;
}Example 28
| Project: Tanaguru-master File: HTMLJsoupCleanerImpl.java View source code |
/**
 * Recursively strips every comment node from the subtree below {@code node}.
 *
 * @param node the node whose descendants are scanned for comments
 */
private void removeComments(Node node) {
    // Children are removed while scanning, so an index-based loop is used instead of a
    // for-each (which would fail with a concurrent modification error). The index only
    // advances when the current child is kept.
    for (int index = 0; index < node.childNodes().size(); ) {
        final Node candidate = node.childNode(index);
        if (!candidate.nodeName().equals("#comment")) {
            removeComments(candidate);
            index++;
            continue;
        }
        candidate.remove();
    }
}Example 29
| Project: storm-crawler-master File: JSoupDOMBuilder.java View source code |
/**
 * Recursive worker that mirrors a Jsoup <tt>Node</tt> and its descendants under a W3C
 * {@link Node}.
 *
 * <p>A Jsoup document contributes only its children; an element is recreated with its
 * attributes and children; text, comment and data nodes are copied unless {@code out} is
 * itself a {@link Document}. Any other node type is ignored.
 *
 * @param node
 *            The Jsoup node containing the content to copy to the specified
 *            W3C {@link Node}.
 * @param out
 *            The W3C {@link Node} that receives the DOM content.
 * @param doc
 *            The W3C {@link Document} used to create the new nodes.
 * @param ns
 *            Namespace declarations collected so far, keyed by prefix; every
 *            <tt>xmlns:*</tt> attribute encountered is added to it.
 */
public static void createDOM(org.jsoup.nodes.Node node, Node out, Document doc, Map<String, String> ns) {
    if (node instanceof org.jsoup.nodes.Document) {
        for (org.jsoup.nodes.Node child : ((org.jsoup.nodes.Document) node).childNodes()) {
            createDOM(child, out, doc, ns);
        }
        return;
    }
    if (node instanceof org.jsoup.nodes.Element) {
        final org.jsoup.nodes.Element source = (org.jsoup.nodes.Element) node;
        final org.w3c.dom.Element target = doc.createElement(source.tagName());
        out.appendChild(target);
        for (org.jsoup.nodes.Attribute attribute : source.attributes()) {
            String name = attribute.getKey();
            // omit xhtml namespace
            if (name.equals("xmlns")) {
                continue;
            }
            final String prefix = getNSPrefix(name);
            if ("xmlns".equals(prefix)) {
                ns.put(getLocalName(name), attribute.getValue());
            } else if (prefix != null && !"xml".equals(prefix) && ns.get(prefix) == null) {
                // fix attribute names looking like qnames
                name = name.replace(':', '_');
            }
            target.setAttribute(name, attribute.getValue());
        }
        for (org.jsoup.nodes.Node child : source.childNodes()) {
            createDOM(child, target, doc, ns);
        }
        return;
    }
    // Leaf content is never attached directly to the W3C document node.
    if (out instanceof Document) {
        return;
    }
    if (node instanceof org.jsoup.nodes.TextNode) {
        out.appendChild(doc.createTextNode(((org.jsoup.nodes.TextNode) node).text()));
    } else if (node instanceof org.jsoup.nodes.Comment) {
        out.appendChild(doc.createComment(((org.jsoup.nodes.Comment) node).getData()));
    } else if (node instanceof org.jsoup.nodes.DataNode) {
        out.appendChild(doc.createTextNode(((org.jsoup.nodes.DataNode) node).getWholeData()));
    }
}Example 30
| Project: web-crawler-master File: JSoupDOMBuilder.java View source code |
/**
 * Recursive worker that mirrors a Jsoup <tt>Node</tt> and its descendants under a W3C
 * {@link Node}.
 *
 * <p>A Jsoup document contributes only its children; an element is recreated with its
 * attributes and children; text, comment and data nodes are copied unless {@code out} is
 * itself a {@link Document}. Any other node type is ignored.
 *
 * @param node
 *            The Jsoup node containing the content to copy to the specified
 *            W3C {@link Node}.
 * @param out
 *            The W3C {@link Node} that receives the DOM content.
 * @param doc
 *            The W3C {@link Document} used to create the new nodes.
 * @param ns
 *            Namespace declarations collected so far, keyed by prefix; every
 *            <tt>xmlns:*</tt> attribute encountered is added to it.
 */
public static void createDOM(org.jsoup.nodes.Node node, Node out, Document doc, Map<String, String> ns) {
    if (node instanceof org.jsoup.nodes.Document) {
        for (org.jsoup.nodes.Node child : ((org.jsoup.nodes.Document) node).childNodes()) {
            createDOM(child, out, doc, ns);
        }
        return;
    }
    if (node instanceof org.jsoup.nodes.Element) {
        final org.jsoup.nodes.Element source = (org.jsoup.nodes.Element) node;
        final org.w3c.dom.Element target = doc.createElement(source.tagName());
        out.appendChild(target);
        for (org.jsoup.nodes.Attribute attribute : source.attributes()) {
            String name = attribute.getKey();
            // omit xhtml namespace
            if (name.equals("xmlns")) {
                continue;
            }
            final String prefix = getNSPrefix(name);
            if ("xmlns".equals(prefix)) {
                ns.put(getLocalName(name), attribute.getValue());
            } else if (prefix != null && !"xml".equals(prefix) && ns.get(prefix) == null) {
                // fix attribute names looking like qnames
                name = name.replace(':', '_');
            }
            target.setAttribute(name, attribute.getValue());
        }
        for (org.jsoup.nodes.Node child : source.childNodes()) {
            createDOM(child, target, doc, ns);
        }
        return;
    }
    // Leaf content is never attached directly to the W3C document node.
    if (out instanceof Document) {
        return;
    }
    if (node instanceof org.jsoup.nodes.TextNode) {
        out.appendChild(doc.createTextNode(((org.jsoup.nodes.TextNode) node).text()));
    } else if (node instanceof org.jsoup.nodes.Comment) {
        out.appendChild(doc.createComment(((org.jsoup.nodes.Comment) node).getData()));
    } else if (node instanceof org.jsoup.nodes.DataNode) {
        out.appendChild(doc.createTextNode(((org.jsoup.nodes.DataNode) node).getWholeData()));
    }
}Example 31
| Project: asta4d-master File: Asta4DTagSupportHtmlTreeBuilderState.java View source code |
/**
 * Handles one token for this tree-builder state by dispatching on {@code t.type}
 * (Character, Comment, Doctype, StartTag, EndTag, EOF) and, for tags, on the tag name.
 * The branches match the HTML "in body" insertion mode (presumably - the state's own name
 * is not visible in this method).
 *
 * @param t the token to process
 * @param tb the tree builder whose stack, formatting-element list and state are updated
 * @return {@code false} from the branches that reject or ignore the token, the result of the
 *         delegated call where a branch re-dispatches the token, and {@code true} otherwise
 */
boolean process(Token t, Asta4DTagSupportHtmlTreeBuilder tb) {
switch(t.type) {
case Character:
{
Token.Character c = t.asCharacter();
if (c.getData().equals(nullString)) {
// todo confirm that check
tb.error(this);
return false;
} else if (tb.framesetOk() && isWhitespace(c)) {
// don't check if whitespace if frames already closed
tb.reconstructFormattingElements();
tb.insert(c);
} else {
// non-whitespace character data: insert it and clear the frameset-ok flag
tb.reconstructFormattingElements();
tb.insert(c);
tb.framesetOk(false);
}
break;
}
case Comment:
{
tb.insert(t.asComment());
break;
}
case Doctype:
{
tb.error(this);
return false;
}
case StartTag:
// NOTE: startTag and name are declared without a block, so they stay in scope for the
// EndTag case below, which reassigns name.
Token.StartTag startTag = t.asStartTag();
String name = startTag.name();
if (name.equals("html")) {
tb.error(this);
// merge attributes onto real html
Element html = tb.getStack().getFirst();
for (Attribute attribute : startTag.getAttributes()) {
if (!html.hasAttr(attribute.getKey()))
html.attributes().put(attribute);
}
} else if (StringUtil.in(name, Constants.InBodyStartToHead)) {
return tb.process(t, InHead);
} else if (name.equals("body")) {
tb.error(this);
LinkedList<Element> stack = tb.getStack();
if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName().equals("body"))) {
// ignore
return false;
} else {
// merge attributes onto the existing body element (second entry of the stack)
tb.framesetOk(false);
Element body = stack.get(1);
for (Attribute attribute : startTag.getAttributes()) {
if (!body.hasAttr(attribute.getKey()))
body.attributes().put(attribute);
}
}
} else if (name.equals("frameset")) {
tb.error(this);
LinkedList<Element> stack = tb.getStack();
if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName().equals("body"))) {
// ignore
return false;
} else if (!tb.framesetOk()) {
// ignore frameset
return false;
} else {
Element second = stack.get(1);
if (second.parent() != null)
second.remove();
// pop up to html element
while (stack.size() > 1) stack.removeLast();
tb.insert(startTag);
tb.transition(InFrameset);
}
} else if (StringUtil.in(name, Constants.InBodyStartPClosers)) {
if (tb.inButtonScope("p")) {
tb.process(new Token.EndTag("p"));
}
tb.insert(startTag);
} else if (StringUtil.in(name, Constants.Headings)) {
if (tb.inButtonScope("p")) {
tb.process(new Token.EndTag("p"));
}
if (StringUtil.in(tb.currentElement().nodeName(), Constants.Headings)) {
tb.error(this);
tb.pop();
}
tb.insert(startTag);
} else if (StringUtil.in(name, Constants.InBodyStartPreListing)) {
if (tb.inButtonScope("p")) {
tb.process(new Token.EndTag("p"));
}
tb.insert(startTag);
// todo: ignore LF if next token
tb.framesetOk(false);
} else if (name.equals("form")) {
if (tb.getFormElement() != null) {
tb.error(this);
return false;
}
if (tb.inButtonScope("p")) {
tb.process(new Token.EndTag("p"));
}
tb.insertForm(startTag, true);
} else if (name.equals("li")) {
tb.framesetOk(false);
LinkedList<Element> stack = tb.getStack();
// scan the stack from the top (excluding index 0) for an open li to close first
for (int i = stack.size() - 1; i > 0; i--) {
Element el = stack.get(i);
if (el.nodeName().equals("li")) {
tb.process(new Token.EndTag("li"));
break;
}
if (tb.isSpecial(el) && !StringUtil.in(el.nodeName(), Constants.InBodyStartLiBreakers))
break;
}
if (tb.inButtonScope("p")) {
tb.process(new Token.EndTag("p"));
}
tb.insert(startTag);
} else if (StringUtil.in(name, Constants.DdDt)) {
tb.framesetOk(false);
LinkedList<Element> stack = tb.getStack();
// same scan as for li, but closing an open element whose name is in Constants.DdDt
for (int i = stack.size() - 1; i > 0; i--) {
Element el = stack.get(i);
if (StringUtil.in(el.nodeName(), Constants.DdDt)) {
tb.process(new Token.EndTag(el.nodeName()));
break;
}
if (tb.isSpecial(el) && !StringUtil.in(el.nodeName(), Constants.InBodyStartLiBreakers))
break;
}
if (tb.inButtonScope("p")) {
tb.process(new Token.EndTag("p"));
}
tb.insert(startTag);
} else if (name.equals("plaintext")) {
if (tb.inButtonScope("p")) {
tb.process(new Token.EndTag("p"));
}
tb.insert(startTag);
// once in, never gets out
tb.tokeniser.transition(TokeniserState.PLAINTEXT);
} else if (name.equals("button")) {
if (tb.inButtonScope("button")) {
// close and reprocess
tb.error(this);
tb.process(new Token.EndTag("button"));
tb.process(startTag);
} else {
tb.reconstructFormattingElements();
tb.insert(startTag);
tb.framesetOk(false);
}
} else if (name.equals("a")) {
if (tb.getActiveFormattingElement("a") != null) {
tb.error(this);
tb.process(new Token.EndTag("a"));
// still on stack?
Element remainingA = tb.getFromStack("a");
if (remainingA != null) {
tb.removeFromActiveFormattingElements(remainingA);
tb.removeFromStack(remainingA);
}
}
tb.reconstructFormattingElements();
Element a = tb.insert(startTag);
tb.pushActiveFormattingElements(a);
} else if (StringUtil.in(name, Constants.Formatters)) {
tb.reconstructFormattingElements();
Element el = tb.insert(startTag);
tb.pushActiveFormattingElements(el);
} else if (name.equals("nobr")) {
tb.reconstructFormattingElements();
if (tb.inScope("nobr")) {
tb.error(this);
tb.process(new Token.EndTag("nobr"));
tb.reconstructFormattingElements();
}
Element el = tb.insert(startTag);
tb.pushActiveFormattingElements(el);
} else if (StringUtil.in(name, Constants.InBodyStartApplets)) {
tb.reconstructFormattingElements();
tb.insert(startTag);
tb.insertMarkerToFormattingElements();
tb.framesetOk(false);
} else if (name.equals("table")) {
if (tb.getDocument().quirksMode() != Document.QuirksMode.quirks && tb.inButtonScope("p")) {
tb.process(new Token.EndTag("p"));
}
tb.insert(startTag);
tb.framesetOk(false);
tb.transition(InTable);
} else if (StringUtil.in(name, Constants.InBodyStartEmptyFormatters)) {
tb.reconstructFormattingElements();
tb.insertEmpty(startTag);
tb.framesetOk(false);
} else if (name.equals("input")) {
tb.reconstructFormattingElements();
Element el = tb.insertEmpty(startTag);
if (!el.attr("type").equalsIgnoreCase("hidden"))
tb.framesetOk(false);
} else if (StringUtil.in(name, Constants.InBodyStartMedia)) {
tb.insertEmpty(startTag);
} else if (name.equals("hr")) {
if (tb.inButtonScope("p")) {
tb.process(new Token.EndTag("p"));
}
tb.insertEmpty(startTag);
tb.framesetOk(false);
} else if (name.equals("image")) {
if (tb.getFromStack("svg") == null)
// change <image> to <img>, unless in svg
return tb.process(startTag.name("img"));
else
tb.insert(startTag);
} else if (name.equals("isindex")) {
// how much do we care about the early 90s?
tb.error(this);
if (tb.getFormElement() != null)
return false;
// expand isindex into a synthetic form / hr / label / input / hr sequence
tb.tokeniser.acknowledgeSelfClosingFlag();
tb.process(new Token.StartTag("form"));
if (startTag.attributes.hasKey("action")) {
Element form = tb.getFormElement();
form.attr("action", startTag.attributes.get("action"));
}
tb.process(new Token.StartTag("hr"));
tb.process(new Token.StartTag("label"));
// hope you like english.
String prompt = startTag.attributes.hasKey("prompt") ? startTag.attributes.get("prompt") : "This is a searchable index. Enter search keywords: ";
tb.process(new Token.Character(prompt));
// input
Attributes inputAttribs = new Attributes();
for (Attribute attr : startTag.attributes) {
if (!StringUtil.in(attr.getKey(), Constants.InBodyStartInputAttribs))
inputAttribs.put(attr);
}
inputAttribs.put("name", "isindex");
tb.process(new Token.StartTag("input", inputAttribs));
tb.process(new Token.EndTag("label"));
tb.process(new Token.StartTag("hr"));
tb.process(new Token.EndTag("form"));
} else if (name.equals("textarea")) {
tb.insert(startTag);
// todo: If the next token is a U+000A LINE FEED (LF) character token, then ignore that token and move on to the next
// one. (Newlines at the start of textarea elements are ignored as an authoring convenience.)
tb.tokeniser.transition(TokeniserState.Rcdata);
tb.markInsertionMode();
tb.framesetOk(false);
tb.transition(Text);
} else if (name.equals("xmp")) {
if (tb.inButtonScope("p")) {
tb.process(new Token.EndTag("p"));
}
tb.reconstructFormattingElements();
tb.framesetOk(false);
handleRawtext(startTag, tb);
} else if (name.equals("iframe")) {
tb.framesetOk(false);
handleRawtext(startTag, tb);
} else if (name.equals("noembed")) {
// also handle noscript if script enabled
handleRawtext(startTag, tb);
} else if (name.equals("select")) {
tb.reconstructFormattingElements();
tb.insert(startTag);
tb.framesetOk(false);
// a select opened from one of the table-related states uses the in-table select state
Asta4DTagSupportHtmlTreeBuilderState state = tb.state();
if (state.equals(InTable) || state.equals(InCaption) || state.equals(InTableBody) || state.equals(InRow) || state.equals(InCell))
tb.transition(InSelectInTable);
else
tb.transition(InSelect);
} else if (StringUtil.in(name, Constants.InBodyStartOptions)) {
if (tb.currentElement().nodeName().equals("option"))
tb.process(new Token.EndTag("option"));
tb.reconstructFormattingElements();
tb.insert(startTag);
} else if (StringUtil.in(name, Constants.InBodyStartRuby)) {
if (tb.inScope("ruby")) {
tb.generateImpliedEndTags();
if (!tb.currentElement().nodeName().equals("ruby")) {
tb.error(this);
// i.e. close up to but not include name
tb.popStackToBefore("ruby");
}
tb.insert(startTag);
}
} else if (name.equals("math")) {
tb.reconstructFormattingElements();
// todo: handle A start tag whose tag name is "math" (i.e. foreign, mathml)
tb.insert(startTag);
tb.tokeniser.acknowledgeSelfClosingFlag();
} else if (name.equals("svg")) {
tb.reconstructFormattingElements();
// todo: handle A start tag whose tag name is "svg" (xlink, svg)
tb.insert(startTag);
tb.tokeniser.acknowledgeSelfClosingFlag();
} else if (StringUtil.in(name, Constants.InBodyStartDrop)) {
tb.error(this);
return false;
} else {
// any other start tag: insert as an ordinary element
tb.reconstructFormattingElements();
tb.insert(startTag);
}
break;
case EndTag:
Token.EndTag endTag = t.asEndTag();
name = endTag.name();
if (name.equals("body")) {
if (!tb.inScope("body")) {
tb.error(this);
return false;
} else {
// todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead,
// tr, body, html
tb.transition(AfterBody);
}
} else if (name.equals("html")) {
// treat </html> as </body> followed by reprocessing the same end tag
boolean notIgnored = tb.process(new Token.EndTag("body"));
if (notIgnored)
return tb.process(endTag);
} else if (StringUtil.in(name, Constants.InBodyEndClosers)) {
if (!tb.inScope(name)) {
// nothing to close
tb.error(this);
return false;
} else {
tb.generateImpliedEndTags();
if (!tb.currentElement().nodeName().equals(name))
tb.error(this);
tb.popStackToClose(name);
}
} else if (name.equals("form")) {
Element currentForm = tb.getFormElement();
tb.setFormElement(null);
if (currentForm == null || !tb.inScope(name)) {
tb.error(this);
return false;
} else {
tb.generateImpliedEndTags();
if (!tb.currentElement().nodeName().equals(name))
tb.error(this);
// remove currentForm from stack. will shift anything under up.
tb.removeFromStack(currentForm);
}
} else if (name.equals("p")) {
if (!tb.inButtonScope(name)) {
tb.error(this);
// if no p to close, creates an empty <p></p>
tb.process(new Token.StartTag(name));
return tb.process(endTag);
} else {
tb.generateImpliedEndTags(name);
if (!tb.currentElement().nodeName().equals(name))
tb.error(this);
tb.popStackToClose(name);
}
} else if (name.equals("li")) {
if (!tb.inListItemScope(name)) {
tb.error(this);
return false;
} else {
tb.generateImpliedEndTags(name);
if (!tb.currentElement().nodeName().equals(name))
tb.error(this);
tb.popStackToClose(name);
}
} else if (StringUtil.in(name, Constants.DdDt)) {
if (!tb.inScope(name)) {
tb.error(this);
return false;
} else {
tb.generateImpliedEndTags(name);
if (!tb.currentElement().nodeName().equals(name))
tb.error(this);
tb.popStackToClose(name);
}
} else if (StringUtil.in(name, Constants.Headings)) {
if (!tb.inScope(Constants.Headings)) {
tb.error(this);
return false;
} else {
tb.generateImpliedEndTags(name);
if (!tb.currentElement().nodeName().equals(name))
tb.error(this);
tb.popStackToClose(Constants.Headings);
}
} else if (name.equals("sarcasm")) {
// *sigh*
return anyOtherEndTag(t, tb);
} else if (StringUtil.in(name, Constants.InBodyEndAdoptionFormatters)) {
// Adoption Agency Algorithm.
// The outer loop is capped at 8 passes; each pass either returns or re-parents the
// contents of the furthest block under a fresh copy of the formatting element.
OUTER: for (int i = 0; i < 8; i++) {
Element formatEl = tb.getActiveFormattingElement(name);
if (formatEl == null)
return anyOtherEndTag(t, tb);
else if (!tb.onStack(formatEl)) {
tb.error(this);
tb.removeFromActiveFormattingElements(formatEl);
return true;
} else if (!tb.inScope(formatEl.nodeName())) {
tb.error(this);
return false;
} else if (tb.currentElement() != formatEl)
tb.error(this);
Element furthestBlock = null;
Element commonAncestor = null;
boolean seenFormattingElement = false;
LinkedList<Element> stack = tb.getStack();
// the spec doesn't limit to < 64, but in degenerate cases (9000+ stack depth) this prevents
// run-aways
final int stackSize = stack.size();
for (int si = 0; si < stackSize && si < 64; si++) {
Element el = stack.get(si);
if (el == formatEl) {
// NOTE(review): assumes formatEl is never at index 0, otherwise get(si - 1) fails - confirm
commonAncestor = stack.get(si - 1);
seenFormattingElement = true;
} else if (seenFormattingElement && tb.isSpecial(el)) {
furthestBlock = el;
break;
}
}
if (furthestBlock == null) {
// no special element after the formatting element: simply close it
tb.popStackToClose(formatEl.nodeName());
tb.removeFromActiveFormattingElements(formatEl);
return true;
}
// todo: Let a bookmark note the position of the formatting element in the list of active formatting elements
// relative to the elements on either side of it in the list.
// does that mean: int pos of format el in list?
Element node = furthestBlock;
Element lastNode = furthestBlock;
INNER: for (int j = 0; j < 3; j++) {
if (tb.onStack(node))
node = tb.aboveOnStack(node);
if (!tb.isInActiveFormattingElements(node)) {
// note no bookmark check
tb.removeFromStack(node);
continue INNER;
} else if (node == formatEl)
break INNER;
Element replacement = new Element(Tag.valueOf(node.nodeName()), tb.getBaseUri());
tb.replaceActiveFormattingElement(node, replacement);
tb.replaceOnStack(node, replacement);
node = replacement;
if (lastNode == furthestBlock) {
// todo: move the aforementioned bookmark to be immediately after the new node in the list of active
// formatting elements.
// not getting how this bookmark both straddles the element above, but is inbetween here...
}
if (lastNode.parent() != null)
lastNode.remove();
node.appendChild(lastNode);
lastNode = node;
}
if (StringUtil.in(commonAncestor.nodeName(), Constants.InBodyEndTableFosters)) {
if (lastNode.parent() != null)
lastNode.remove();
tb.insertInFosterParent(lastNode);
} else {
if (lastNode.parent() != null)
lastNode.remove();
commonAncestor.appendChild(lastNode);
}
Element adopter = new Element(formatEl.tag(), tb.getBaseUri());
adopter.attributes().addAll(formatEl.attributes());
// snapshot the children into an array before moving them to the adopter
Node[] childNodes = furthestBlock.childNodes().toArray(new Node[furthestBlock.childNodeSize()]);
for (Node childNode : childNodes) {
// append will reparent. thus the clone to avoid concurrent mod.
// NOTE(review): the "clone" above is the array snapshot; the nodes themselves are moved, not cloned
adopter.appendChild(childNode);
}
furthestBlock.appendChild(adopter);
tb.removeFromActiveFormattingElements(formatEl);
// todo: insert the new element into the list of active formatting elements at the position of the aforementioned
// bookmark.
tb.removeFromStack(formatEl);
tb.insertOnStackAfter(furthestBlock, adopter);
}
} else if (StringUtil.in(name, Constants.InBodyStartApplets)) {
// NOTE(review): "name" below is a string literal, not the variable name - looks unintended
// (the inner check uses the variable); confirm before changing
if (!tb.inScope("name")) {
if (!tb.inScope(name)) {
tb.error(this);
return false;
}
tb.generateImpliedEndTags();
if (!tb.currentElement().nodeName().equals(name))
tb.error(this);
tb.popStackToClose(name);
tb.clearFormattingElementsToLastMarker();
}
} else if (name.equals("br")) {
// </br> is reported as an error and handled as a <br> start tag
tb.error(this);
tb.process(new Token.StartTag("br"));
return false;
} else {
return anyOtherEndTag(t, tb);
}
break;
case EOF:
// stop parsing
break;
}
return true;
}Example 32
| Project: baleen-master File: NewLineToNewParagraph.java View source code |
/**
 * Groups copies of the child nodes of {@code e} into runs, where a run is a maximal
 * sequence of nodes not interrupted by a {@code br} element. Every run is wrapped in a newly
 * created {@code p} element; the {@code br} elements themselves are dropped.
 *
 * @param document the document used to create the wrapping {@code p} elements
 * @param e the element whose children are split into runs
 * @return the runs in document order (empty when {@code e} holds nothing but {@code br}s)
 */
private List<Element> collectRuns(Document document, Element e) {
    final List<Element> paragraphs = new LinkedList<>();
    Element current = null;
    for (Node child : e.childNodesCopy()) {
        final boolean lineBreak = child instanceof Element && "br".equalsIgnoreCase(((Element) child).tagName());
        if (!lineBreak) {
            // Ordinary content: open a run on demand and append to it
            if (current == null) {
                current = document.createElement("p");
            }
            current.appendChild(child);
            continue;
        }
        // A br closes the run in progress, if there is one
        if (current != null) {
            paragraphs.add(current);
            current = null;
        }
    }
    // Flush the run still open at the end
    if (current != null) {
        paragraphs.add(current);
    }
    return paragraphs;
}Example 33
| Project: facelets-lite-master File: Test.java View source code |
/**
 * Normalises {@code doc}, trims the text of each of its text nodes and returns the HTML
 * obtained after passing the document through {@code cleaner}.
 *
 * @param doc the document to normalise; it is modified in place
 * @return the HTML of the cleaned document
 */
String toNormalHtml(Document doc) {
    doc.normalise();
    final NodeVisitor trimmer = new NodeVisitor() {
        @Override
        public void head(Node node, int depth) {
            // nothing to do when entering a node
        }

        @Override
        public void tail(Node node, int depth) {
            if (!(node instanceof TextNode)) {
                return;
            }
            final TextNode text = (TextNode) node;
            final String trimmed = text.text().trim();
            text.text(trimmed);
        }
    };
    doc.traverse(trimmer);
    return cleaner.clean(doc).html();
}Example 34
| Project: framework-master File: MenuBar.java View source code |
/**
 * Builds a {@code MenuItem} (and, recursively, its sub menu items) from a design element.
 *
 * <p>Child {@code menu} elements become sub menus; the string form of every other child node
 * is concatenated and trimmed to form the caption. The optional attributes {@code icon},
 * {@code disabled}, {@code visible}, {@code separator}, {@code checkable}, {@code checked},
 * {@code description} and {@code style-name} are applied when present.
 *
 * @param menuElement the element describing the menu item
 * @return the menu item read from {@code menuElement}
 */
protected MenuItem readMenuElement(Element menuElement) {
    Resource icon = null;
    if (menuElement.hasAttr("icon")) {
        icon = DesignAttributeHandler.getFormatter().parse(menuElement.attr("icon"), Resource.class);
    }
    // Accumulate the caption in a StringBuilder instead of repeated String concatenation.
    StringBuilder caption = new StringBuilder();
    List<Element> subMenus = new ArrayList<>();
    for (Node node : menuElement.childNodes()) {
        if (node instanceof Element && ((Element) node).tagName().equals("menu")) {
            subMenus.add((Element) node);
        } else {
            caption.append(node.toString());
        }
    }
    MenuItem menu = new MenuItem(caption.toString().trim(), icon, null);
    Attributes attr = menuElement.attributes();
    if (menuElement.hasAttr("icon")) {
        menu.setIcon(DesignAttributeHandler.readAttribute("icon", attr, Resource.class));
    }
    if (menuElement.hasAttr("disabled")) {
        // the design attribute is "disabled", the item property is "enabled": invert
        menu.setEnabled(!DesignAttributeHandler.readAttribute("disabled", attr, boolean.class));
    }
    if (menuElement.hasAttr("visible")) {
        menu.setVisible(DesignAttributeHandler.readAttribute("visible", attr, boolean.class));
    }
    if (menuElement.hasAttr("separator")) {
        menu.setSeparator(DesignAttributeHandler.readAttribute("separator", attr, boolean.class));
    }
    if (menuElement.hasAttr("checkable")) {
        menu.setCheckable(DesignAttributeHandler.readAttribute("checkable", attr, boolean.class));
    }
    if (menuElement.hasAttr("checked")) {
        menu.setChecked(DesignAttributeHandler.readAttribute("checked", attr, boolean.class));
    }
    if (menuElement.hasAttr("description")) {
        menu.setDescription(DesignAttributeHandler.readAttribute("description", attr, String.class));
    }
    if (menuElement.hasAttr("style-name")) {
        menu.setStyleName(DesignAttributeHandler.readAttribute("style-name", attr, String.class));
    }
    // The child list is only created when there is at least one sub menu.
    if (!subMenus.isEmpty()) {
        menu.itsChildren = new ArrayList<>();
    }
    for (Element subMenu : subMenus) {
        MenuItem newItem = readMenuElement(subMenu);
        newItem.setParent(menu);
        menu.itsChildren.add(newItem);
    }
    return menu;
}Example 35
| Project: jinjava-master File: TruncateHtmlFilter.java View source code |
/**
 * Called when the traversal enters a node; enforces the {@code maxTextLen} budget on text
 * nodes and ignores every other node type.
 *
 * <p>Once the budget is used up, text nodes are emptied. The text node that crosses the
 * budget is cut (at a word boundary unless {@code killwords} is set) and gets {@code ending}
 * appended. Text that fits is left unchanged and counted in {@code textLen}.
 *
 * @param node the node being entered
 * @param depth the depth of the node in the tree (unused)
 */
@Override
public void head(Node node, int depth) {
    if (!(node instanceof TextNode)) {
        return;
    }
    final TextNode textNode = (TextNode) node;
    final String content = textNode.text();
    // Budget already exhausted: drop the text entirely
    if (textLen >= maxTextLen) {
        textNode.text("");
        return;
    }
    // Text fits in the remaining budget: keep it and account for it
    if (textLen + content.length() <= maxTextLen) {
        textLen += content.length();
        return;
    }
    // Text crosses the budget: cut it and mark the budget as used up
    int cut = maxTextLen - textLen;
    if (!killwords) {
        cut = Functions.movePointerToJustBeforeLastWord(cut, content) - 1;
    }
    textNode.text(content.substring(0, cut) + ending);
    textLen = maxTextLen;
}Example 36
| Project: kune-master File: ContentUnrenderer.java View source code |
// private static final Logger LOG = // Logger.getLogger(ContentUnrenderer.class.getName()); /** * Helper method to recursively parse a HTML element and construct a wave * document. * * @param parent the parent * @param output the output * @param elements the elements * @param annotations the annotations */ private static void unrender(final Node parent, final StringBuilder output, final Map<Integer, com.google.wave.api.Element> elements, final Annotations annotations) { for (final Node node : parent.childNodes()) { if (node instanceof TextNode) { output.append(((TextNode) node).text()); } else if (node instanceof Element) { final int position = output.length(); final Element element = (Element) node; final String name = element.tag().getName(); if ("p".equalsIgnoreCase(name)) { elements.put(position, new Line()); // handle any attributes? } // Additional HTML element tags here. unrender(element, output, elements, annotations); } } }
Example 37
| Project: ScreenSlicer-master File: Scrape.java View source code |
/**
 * Loads a page in {@code browser} on a separate worker thread and returns its cleaned HTML.
 *
 * <p>The worker first tries the live page (unless {@code p_cached} is set), falls back to the
 * URL produced by {@code toCacheUrl} when the live content is empty or caching was requested,
 * and stores the outer HTML of {@code NodeUtil.clean(...)} as the result. The calling thread
 * waits up to {@code HANG_TIME} for the worker; if no result has been published by then the
 * browser is treated as hanging.
 *
 * @param browser the browser used to fetch the page
 * @param throttle passed to {@code BrowserUtil.browserSleepLong} after the fetch
 * @param urlNode optional node to navigate through; may be {@code null} when {@code url} is set
 * @param url the URL to load; may be empty when {@code urlNode} is set
 * @param p_cached whether to skip the live fetch and go straight to the cached URL
 * @param runGuid identifier checked with {@code ScreenSlicerBatch.isCancelled} before the cached fetch
 * @param toNewWindow passed through to {@code BrowserUtil.get}
 * @param init passed through to the {@code BrowserUtil} calls
 * @param postFetchClicks clicks performed after the live fetch; may be {@code null}
 * @return the cleaned HTML, or {@code null} when neither {@code url} nor {@code urlNode} is
 *         given, when the worker finished without content, or when waiting failed with a
 *         logged exception
 * @throws Browser.Retry if the worker has not published a result after {@code HANG_TIME}
 */
private static String getHelper(final Browser browser, final boolean throttle, final Node urlNode, final String url, final boolean p_cached, final String runGuid, final boolean toNewWindow, final boolean init, final HtmlNode[] postFetchClicks) {
if (!CommonUtil.isEmpty(url) || urlNode != null) {
final Object resultLock = new Object();
final String initVal;
final String[] result;
// result[0] starts as a sentinel value (initVal); it still holding the sentinel means
// "the worker has not finished yet".
synchronized (resultLock) {
initVal = Random.next();
result = new String[] { initVal };
}
final AtomicBoolean started = new AtomicBoolean();
Thread thread = new Thread(new Runnable() {
@Override
public void run() {
// terminate is set when Retry/Fatal is rethrown; it suppresses the post-fetch
// sleep and window handling in the finally block below.
boolean terminate = false;
started.set(true);
boolean cached = p_cached;
String newHandle = null;
String origHandle = null;
try {
origHandle = browser.getWindowHandle();
String content = null;
if (!cached) {
try {
BrowserUtil.get(browser, url, urlNode, false, toNewWindow, init);
} catch (Browser.Retry r) {
terminate = true;
throw r;
} catch (Browser.Fatal f) {
terminate = true;
throw f;
} catch (Throwable t) {
// any other failure: retry once with the plain URL (in a new window when
// navigation was node-based)
if (urlNode != null) {
BrowserUtil.newWindow(browser, init);
}
BrowserUtil.get(browser, url, false, init);
}
if (urlNode != null) {
newHandle = browser.getWindowHandle();
}
BrowserUtil.doClicks(browser, postFetchClicks, null, null);
content = browser.getPageSource();
if (WebApp.DEBUG && (postFetchClicks == null || postFetchClicks.length == 0)) {
// debug aid: dump a screenshot and the page source next to each other
try {
long filename = System.currentTimeMillis();
Files.copy(browser.getScreenshotAs(OutputType.FILE), new File("./" + filename + ".log.scrape.png"));
FileUtils.writeStringToFile(new File("./" + filename + ".log.scrape.htm"), content, "utf-8");
} catch (IOException e) {
// NOTE(review): I/O failures of the debug dump are silently ignored
}
}
// empty live content falls through to the cached fetch below
if (CommonUtil.isEmpty(content)) {
cached = true;
}
}
if (cached) {
if (ScreenSlicerBatch.isCancelled(runGuid)) {
return;
}
try {
BrowserUtil.get(browser, toCacheUrl(url, false), false, init);
} catch (Browser.Retry r) {
terminate = true;
throw r;
} catch (Browser.Fatal f) {
terminate = true;
throw f;
} catch (Throwable t) {
// second attempt with the alternate cache URL form
BrowserUtil.get(browser, toCacheUrl(url, true), false, init);
}
content = browser.getPageSource();
}
content = NodeUtil.clean(content, browser.getCurrentUrl()).outerHtml();
// }
synchronized (resultLock) {
result[0] = content;
}
} catch (Browser.Retry r) {
terminate = true;
throw r;
} catch (Browser.Fatal f) {
terminate = true;
throw f;
} catch (Throwable t) {
Log.exception(t);
} finally {
// If no content was published, replace the sentinel with null so the waiting
// thread sees "finished without a result" rather than "still running".
synchronized (resultLock) {
if (initVal.equals(result[0])) {
result[0] = null;
}
}
if (!terminate) {
BrowserUtil.browserSleepLong(throttle);
if (init && newHandle != null && origHandle != null) {
try {
BrowserUtil.handleNewWindows(browser, origHandle, true);
} catch (Browser.Retry r) {
throw r;
} catch (Browser.Fatal f) {
throw f;
} catch (Throwable t) {
Log.exception(t);
}
}
}
}
}
});
thread.start();
try {
// wait until the worker has actually started running
while (!started.get()) {
try {
Thread.sleep(WAIT);
} catch (Throwable t) {
// NOTE(review): swallows everything, including InterruptedException
}
}
// bounded wait; the worker may still be running when join returns
thread.join(HANG_TIME);
synchronized (resultLock) {
// sentinel still present: the worker did not finish within HANG_TIME
if (initVal.equals(result[0])) {
Log.exception(new Exception("Browser is hanging"));
try {
thread.interrupt();
} catch (Throwable t) {
Log.exception(t);
}
throw new Browser.Retry();
}
return result[0];
}
} catch (Browser.Retry r) {
throw r;
} catch (Browser.Fatal f) {
throw f;
} catch (Throwable t) {
Log.exception(t);
}
}
return null;
}Example 38
| Project: slicer-master File: Scrape.java View source code |
/**
 * Loads a page in {@code browser} on a separate worker thread and returns its cleaned HTML.
 *
 * <p>The worker first tries the live page (unless {@code p_cached} is set), falls back to the
 * URL produced by {@code toCacheUrl} when the live content is empty or caching was requested,
 * and stores the outer HTML of {@code NodeUtil.clean(...)} as the result. The calling thread
 * waits up to {@code HANG_TIME} for the worker; if no result has been published by then the
 * browser is treated as hanging.
 *
 * @param browser the browser used to fetch the page
 * @param throttle passed to {@code BrowserUtil.browserSleepLong} after the fetch
 * @param urlNode optional node to navigate through; may be {@code null} when {@code url} is set
 * @param url the URL to load; may be empty when {@code urlNode} is set
 * @param p_cached whether to skip the live fetch and go straight to the cached URL
 * @param runGuid identifier checked with {@code ScreenSlicerBatch.isCancelled} before the cached fetch
 * @param toNewWindow passed through to {@code BrowserUtil.get}
 * @param init passed through to the {@code BrowserUtil} calls
 * @param postFetchClicks clicks performed after the live fetch; may be {@code null}
 * @return the cleaned HTML, or {@code null} when neither {@code url} nor {@code urlNode} is
 *         given, when the worker finished without content, or when waiting failed with a
 *         logged exception
 * @throws Browser.Retry if the worker has not published a result after {@code HANG_TIME}
 */
private static String getHelper(final Browser browser, final boolean throttle, final Node urlNode, final String url, final boolean p_cached, final String runGuid, final boolean toNewWindow, final boolean init, final HtmlNode[] postFetchClicks) {
if (!CommonUtil.isEmpty(url) || urlNode != null) {
final Object resultLock = new Object();
final String initVal;
final String[] result;
// result[0] starts as a sentinel value (initVal); it still holding the sentinel means
// "the worker has not finished yet".
synchronized (resultLock) {
initVal = Random.next();
result = new String[] { initVal };
}
final AtomicBoolean started = new AtomicBoolean();
Thread thread = new Thread(new Runnable() {
@Override
public void run() {
// terminate is set when Retry/Fatal is rethrown; it suppresses the post-fetch
// sleep and window handling in the finally block below.
boolean terminate = false;
started.set(true);
boolean cached = p_cached;
String newHandle = null;
String origHandle = null;
try {
origHandle = browser.getWindowHandle();
String content = null;
if (!cached) {
try {
BrowserUtil.get(browser, url, urlNode, false, toNewWindow, init);
} catch (Browser.Retry r) {
terminate = true;
throw r;
} catch (Browser.Fatal f) {
terminate = true;
throw f;
} catch (Throwable t) {
// any other failure: retry once with the plain URL (in a new window when
// navigation was node-based)
if (urlNode != null) {
BrowserUtil.newWindow(browser, init);
}
BrowserUtil.get(browser, url, false, init);
}
if (urlNode != null) {
newHandle = browser.getWindowHandle();
}
BrowserUtil.doClicks(browser, postFetchClicks, null, null);
content = browser.getPageSource();
if (WebApp.DEBUG && (postFetchClicks == null || postFetchClicks.length == 0)) {
// debug aid: dump a screenshot and the page source next to each other
try {
long filename = System.currentTimeMillis();
Files.copy(browser.getScreenshotAs(OutputType.FILE), new File("./" + filename + ".log.scrape.png"));
FileUtils.writeStringToFile(new File("./" + filename + ".log.scrape.htm"), content, "utf-8");
} catch (IOException e) {
// NOTE(review): I/O failures of the debug dump are silently ignored
}
}
// empty live content falls through to the cached fetch below
if (CommonUtil.isEmpty(content)) {
cached = true;
}
}
if (cached) {
if (ScreenSlicerBatch.isCancelled(runGuid)) {
return;
}
try {
BrowserUtil.get(browser, toCacheUrl(url, false), false, init);
} catch (Browser.Retry r) {
terminate = true;
throw r;
} catch (Browser.Fatal f) {
terminate = true;
throw f;
} catch (Throwable t) {
// second attempt with the alternate cache URL form
BrowserUtil.get(browser, toCacheUrl(url, true), false, init);
}
content = browser.getPageSource();
}
content = NodeUtil.clean(content, browser.getCurrentUrl()).outerHtml();
// }
synchronized (resultLock) {
result[0] = content;
}
} catch (Browser.Retry r) {
terminate = true;
throw r;
} catch (Browser.Fatal f) {
terminate = true;
throw f;
} catch (Throwable t) {
Log.exception(t);
} finally {
// If no content was published, replace the sentinel with null so the waiting
// thread sees "finished without a result" rather than "still running".
synchronized (resultLock) {
if (initVal.equals(result[0])) {
result[0] = null;
}
}
if (!terminate) {
BrowserUtil.browserSleepLong(throttle);
if (init && newHandle != null && origHandle != null) {
try {
BrowserUtil.handleNewWindows(browser, origHandle, true);
} catch (Browser.Retry r) {
throw r;
} catch (Browser.Fatal f) {
throw f;
} catch (Throwable t) {
Log.exception(t);
}
}
}
}
}
});
thread.start();
try {
// wait until the worker has actually started running
while (!started.get()) {
try {
Thread.sleep(WAIT);
} catch (Throwable t) {
// NOTE(review): swallows everything, including InterruptedException
}
}
// bounded wait; the worker may still be running when join returns
thread.join(HANG_TIME);
synchronized (resultLock) {
// sentinel still present: the worker did not finish within HANG_TIME
if (initVal.equals(result[0])) {
Log.exception(new Exception("Browser is hanging"));
try {
thread.interrupt();
} catch (Throwable t) {
Log.exception(t);
}
throw new Browser.Retry();
}
return result[0];
}
} catch (Browser.Retry r) {
throw r;
} catch (Browser.Fatal f) {
throw f;
} catch (Throwable t) {
Log.exception(t);
}
}
return null;
}Example 39
| Project: TuCanMobile-master File: EventsScraper.java View source code |
/**
 * Returns the individual events of the course-status table wrapped in a list adapter.
 *
 * Rows with exactly four cells are read: {@code tbsubhead} rows are module headings and
 * {@code tbdata} rows are events. Other rows are ignored.
 *
 * @param content
 *            content div element
 * @return the adapter, or {@code null} when there is no {@code table.tbcoursestatus} or it
 *         contains no rows
 * @author Daniel Thiem
 */
private ListAdapter getApplicationSingleItems(Element content) {
    final Element statusTable = content.select("table.tbcoursestatus").first();
    if (statusTable == null) {
        return null;
    }
    final Elements rows = statusTable.select("tr");
    if (!(rows.size() > 0)) {
        return null;
    }

    // Individual events are offered
    final ArrayList<String> names = new ArrayList<String>();
    final ArrayList<String> instructors = new ArrayList<String>();
    final ArrayList<String> dates = new ArrayList<String>();
    final ArrayList<Boolean> moduleFlags = new ArrayList<Boolean>();

    for (Element row : rows) {
        final Elements cols = row.select("td");
        final Element firstCol = cols.first();
        if (firstCol == null || cols.size() != 4) {
            continue;
        }
        final Element secondCol = cols.get(1);
        final List<Node> inner = secondCol.childNodes();

        if (firstCol.hasClass("tbsubhead")) {
            // Module heading: the instructor is the fourth child of the second cell
            if (inner.size() == 4 && inner.get(3) instanceof TextNode) {
                final String instructor = ((TextNode) inner.get(3)).text();
                final String title = secondCol.select("span.eventTitle").text();
                final String deadline = cols.get(2).text();
                names.add(title);
                instructors.add(instructor);
                dates.add(deadline);
                moduleFlags.add(true);
            }
        } else if (firstCol.hasClass("tbdata")) {
            // This row is an event; fields stay null when the layout is not recognised
            String eventName = null;
            String eventInstructor = null;
            String eventDates = null;
            switch (inner.size()) {
            case 1:
                // Event with a name only
                eventName = TucanMobile.getEventNameByString(secondCol.html());
                eventInstructor = "";
                eventDates = "";
                break;
            case 7: {
                // Event with full information
                final Node instructorNode = inner.get(4);
                final Node dateNode = inner.get(6);
                if (instructorNode instanceof TextNode && dateNode instanceof TextNode) {
                    eventName = secondCol.select("span.eventTitle").text();
                    eventInstructor = ((TextNode) instructorNode).text().trim();
                    eventDates = ((TextNode) dateNode).text().trim();
                }
                break;
            }
            case 5: {
                // Event without a date
                final Node instructorNode = inner.get(4);
                if (instructorNode instanceof TextNode) {
                    eventName = secondCol.select("span.eventTitle").text();
                    eventInstructor = ((TextNode) instructorNode).text().trim();
                    eventDates = "";
                }
                break;
            }
            default:
                break;
            }
            names.add(eventName);
            instructors.add(eventInstructor);
            dates.add(eventDates);
            moduleFlags.add(false);
        }
    }
    // Build the adapter that is handed back
    return new HighlightedThreeLinesAdapter(context, names, instructors, dates, moduleFlags);
}Example 40
| Project: vaadin-master File: MenuBar.java View source code |
/**
 * Builds a menu item from a design element, recursing into nested {@code <menu>} children.
 *
 * Child {@code <menu>} elements become sub-items; the string form of every other child node is
 * concatenated (and trimmed) to form the caption.
 *
 * @param menuElement
 *            the design element describing this item
 * @return the menu item with its attributes applied and its sub-items attached
 */
protected MenuItem readMenuElement(Element menuElement) {
    Resource icon = null;
    if (menuElement.hasAttr("icon")) {
        icon = DesignAttributeHandler.getFormatter().parse(menuElement.attr("icon"), Resource.class);
    }
    // Accumulate the caption in a builder instead of re-allocating a String per child node
    StringBuilder caption = new StringBuilder();
    List<Element> subMenus = new ArrayList<>();
    for (Node node : menuElement.childNodes()) {
        if (node instanceof Element && ((Element) node).tagName().equals("menu")) {
            subMenus.add((Element) node);
        } else {
            caption.append(node.toString());
        }
    }
    MenuItem menu = new MenuItem(caption.toString().trim(), icon, null);
    Attributes attr = menuElement.attributes();
    if (menuElement.hasAttr("icon")) {
        menu.setIcon(DesignAttributeHandler.readAttribute("icon", attr, Resource.class));
    }
    if (menuElement.hasAttr("disabled")) {
        // The design attribute is "disabled", the item property is "enabled": hence the negation
        menu.setEnabled(!DesignAttributeHandler.readAttribute("disabled", attr, boolean.class));
    }
    if (menuElement.hasAttr("visible")) {
        menu.setVisible(DesignAttributeHandler.readAttribute("visible", attr, boolean.class));
    }
    if (menuElement.hasAttr("separator")) {
        menu.setSeparator(DesignAttributeHandler.readAttribute("separator", attr, boolean.class));
    }
    if (menuElement.hasAttr("checkable")) {
        menu.setCheckable(DesignAttributeHandler.readAttribute("checkable", attr, boolean.class));
    }
    if (menuElement.hasAttr("checked")) {
        menu.setChecked(DesignAttributeHandler.readAttribute("checked", attr, boolean.class));
    }
    if (menuElement.hasAttr("description")) {
        menu.setDescription(DesignAttributeHandler.readAttribute("description", attr, String.class));
    }
    if (menuElement.hasAttr("style-name")) {
        menu.setStyleName(DesignAttributeHandler.readAttribute("style-name", attr, String.class));
    }
    // The child list is only created when there is at least one sub-menu
    if (!subMenus.isEmpty()) {
        menu.itsChildren = new ArrayList<>();
    }
    for (Element subMenu : subMenus) {
        MenuItem newItem = readMenuElement(subMenu);
        newItem.setParent(menu);
        menu.itsChildren.add(newItem);
    }
    return menu;
}Example 41
| Project: web-entity-extractor-ACL2014-master File: KnowledgeTreeBuilder.java View source code |
/**
 * Recursively mirrors a jsoup element (an HTML tag plus its content) as a knowledge-tree node
 * below {@code parent}.
 *
 * Child elements are converted recursively, non-empty text nodes become children named
 * {@code "text"} (unless text nodes are ignored via the options), and every attribute of the
 * element is copied onto the created node.
 *
 * @param elt The jsoup element that becomes the root of the created subtree
 * @param parent The node under which the created subtree is attached
 */
public void convertElementToKTree(Element elt, KNode parent) {
    final String fullText = LingUtils.normalize(elt.text(), opts.earlyNormalizeEntities);
    // Text longer than the configured limit is not stored on the node
    final String storedText = fullText.length() > opts.maxFullTextLength ? null : fullText;
    final KNode created = parent.createChild(KNode.Type.TAG, elt.tagName(), storedText);

    // Children
    for (Node child : elt.childNodes()) {
        if (child instanceof Element) {
            convertElementToKTree((Element) child, created);
            continue;
        }
        if (!(child instanceof TextNode) || opts.ignoreTextNodes) {
            continue;
        }
        final String text = LingUtils.normalize(((TextNode) child).text(), opts.earlyNormalizeEntities);
        if (text.isEmpty()) {
            continue;
        }
        // Text is stored as a TAG node named "text" rather than as a KNode.Type.TEXT node
        created.createChild(KNode.Type.TAG, "text", text.length() > opts.maxFullTextLength ? null : text);
    }

    // Attributes
    for (Attribute attribute : elt.attributes()) {
        created.createAttribute(attribute.getKey(), attribute.getValue());
    }
}Example 42
| Project: zongtui-webcrawler-master File: ElementOperator.java View source code |
/**
 * Extracts text from the direct text-node children of {@code element}.
 *
 * With {@code group == 0} the text of all direct text nodes is concatenated; otherwise the text
 * of the {@code group}-th direct text node (1-based) is returned, or an empty string when there
 * are fewer text nodes than that.
 */
@Override
public String operate(Element element) {
    final StringBuilder allText = new StringBuilder();
    int seen = 0;
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        final TextNode piece = (TextNode) child;
        if (group == 0) {
            allText.append(piece.text());
            continue;
        }
        seen++;
        if (seen == group) {
            return piece.text();
        }
    }
    return allText.toString();
}Example 43
| Project: android-essentials-toolbox-master File: GenerateUndocumentedPermissions.java View source code |
/**
 * Searches for the comment that directly precedes the given xml permission element on the
 * same sibling level.
 *
 * Text nodes (and any other non-element, non-comment nodes) between the comment and the element
 * are skipped; the search stops at the first preceding element.
 *
 * @param permissionElement the element whose leading comment is wanted
 * @return the preceding comment, or {@code null} when an element is reached first or there are
 *         no more preceding siblings
 */
private static org.jsoup.nodes.Comment getPreceedingComment(org.jsoup.nodes.Element permissionElement) {
    org.jsoup.nodes.Node node = permissionElement.previousSibling();
    // Stop when the siblings run out: the former while(true) loop dereferenced null here
    while (node != null) {
        if (node instanceof Comment) {
            return (org.jsoup.nodes.Comment) node;
        }
        if (node instanceof org.jsoup.nodes.Element) {
            return null;
        }
        // important, there is a trailing whitespace character after the comment that is considered as a node
        node = node.previousSibling();
    }
    return null;
}Example 44
| Project: bavrd-core-master File: Face.java View source code |
/**
 * Cleans the given HTML against {@code FORMATTED_TEXT_WHITELIST} and renders each top-level
 * body node through {@code sanitize}, concatenating the results.
 *
 * @param htmlBody the raw HTML
 * @return the concatenated sanitized output
 */
public String formatText(String htmlBody) {
    final Document fragment = Jsoup.parseBodyFragment(Jsoup.clean(htmlBody, FORMATTED_TEXT_WHITELIST));
    final StringBuilder rendered = new StringBuilder();
    for (Node topLevel : fragment.body().childNodes()) {
        rendered.append(sanitize(topLevel));
    }
    return rendered.toString();
}Example 45
| Project: jmeter-master File: JsoupBasedHtmlParser.java View source code |
/**
 * Visitor callback: collects the URLs of embedded resources referenced by an element.
 *
 * Depending on the tag, one or more URL-carrying attributes are passed to
 * {@code extractAttribute}; a {@code base} tag updates {@code baseUrl.url} instead. The
 * {@code style} attribute of every element is scanned for URLs afterwards. Non-element nodes
 * are ignored.
 *
 * @param node the node being entered
 * @param depth depth of the node in the tree (unused)
 * @throws RuntimeException wrapping a {@code MalformedURLException} from resolving a base href
 */
@Override
public void head(Node node, int depth) {
    if (!(node instanceof Element)) {
        return;
    }
    Element tag = (Element) node;
    // Locale.ROOT: the default-locale toLowerCase() maps "I" to a dotless i under e.g. Turkish,
    // which would make tags such as IMG/IFRAME/INPUT/LINK miss their constants
    String tagName = tag.tagName().toLowerCase(java.util.Locale.ROOT);
    if (tagName.equals(TAG_BODY)) {
        extractAttribute(tag, ATT_BACKGROUND);
    } else if (tagName.equals(TAG_SCRIPT)) {
        extractAttribute(tag, ATT_SRC);
    } else if (tagName.equals(TAG_BASE)) {
        String baseref = tag.attr(ATT_HREF);
        try {
            // Bugzilla 30713
            if (!StringUtils.isEmpty(baseref)) {
                baseUrl.url = ConversionUtils.makeRelativeURL(baseUrl.url, baseref);
            }
        } catch (MalformedURLException e1) {
            throw new RuntimeException(e1);
        }
    } else if (tagName.equals(TAG_IMAGE)) {
        extractAttribute(tag, ATT_SRC);
    } else if (tagName.equals(TAG_APPLET)) {
        extractAttribute(tag, ATT_CODE);
    } else if (tagName.equals(TAG_OBJECT)) {
        extractAttribute(tag, ATT_CODEBASE);
        extractAttribute(tag, ATT_DATA);
    } else if (tagName.equals(TAG_INPUT)) {
        // we check the input tag type for image
        if (ATT_IS_IMAGE.equalsIgnoreCase(tag.attr(ATT_TYPE))) {
            // then we need to download the binary
            extractAttribute(tag, ATT_SRC);
        }
        // Bug 51750
    } else if (tagName.equals(TAG_FRAME) || tagName.equals(TAG_IFRAME)) {
        extractAttribute(tag, ATT_SRC);
    } else if (tagName.equals(TAG_EMBED)) {
        extractAttribute(tag, ATT_SRC);
    } else if (tagName.equals(TAG_BGSOUND)) {
        extractAttribute(tag, ATT_SRC);
    } else if (tagName.equals(TAG_LINK)) {
        // Putting the string first means it works even if the attribute is null
        if (STYLESHEET.equalsIgnoreCase(tag.attr(ATT_REL))) {
            extractAttribute(tag, ATT_HREF);
        }
    } else {
        extractAttribute(tag, ATT_BACKGROUND);
    }
    // Now look for URLs in the STYLE attribute
    String styleTagStr = tag.attr(ATT_STYLE);
    if (styleTagStr != null) {
        HtmlParsingUtils.extractStyleURLs(baseUrl.url, urls, styleTagStr);
    }
}Example 46
| Project: link-bubble-master File: OutputFormatter.java View source code |
/**
 * Tells whether a node looks like an unlikely content candidate: its class mentions "caption"
 * (case-insensitively), or its style or class attribute matches {@code unlikelyPattern}.
 */
boolean unlikely(Node e) {
    final String classAttr = e.attr("class");
    if (classAttr != null && classAttr.toLowerCase().contains("caption")) {
        return true;
    }
    final String styleAttr = e.attr("style");
    return unlikelyPattern.matcher(styleAttr).find() || unlikelyPattern.matcher(classAttr).find();
}Example 47
| Project: open-data-service-master File: PegelPortalMvSourceAdapter.java View source code |
/**
 * Concatenates, in document order, the string form of every text node found anywhere below
 * {@code element}. Nodes that are neither text nor elements contribute nothing.
 */
private String extractText(Element element) {
    final StringBuilder collected = new StringBuilder();
    for (Node child : element.childNodes()) {
        if (child instanceof Element) {
            // Descend into nested elements
            collected.append(extractText((Element) child));
            continue;
        }
        if (child instanceof TextNode) {
            collected.append(child.toString());
        }
    }
    return collected.toString();
}Example 48
| Project: opensearchserver-master File: JSoupUtils.java View source code |
/**
 * Collects into {@code nodes} every node reached by following {@code path} from {@code parent},
 * matching one element name per level starting at {@code path[pos]}.
 *
 * When {@code pos} equals the path length, {@code parent} itself is the match.
 */
private static final void getNodes(Node parent, int pos, String[] path, List<Node> nodes) {
    if (pos == path.length) {
        nodes.add(parent);
        return;
    }
    for (Node child : parent.childNodes()) {
        // Only element children whose name matches this path segment are followed
        if (child instanceof Element && child.nodeName().equals(path[pos])) {
            getNodes(child, pos + 1, path, nodes);
        }
    }
}
Example 49
| Project: owsi-core-parent-master File: AbstractNotificationContentDescriptorFactory.java View source code |
/**
 * Visitor callback: inlines the registry's style for the node into its style attribute
 * (appended after any existing value) and sets the link-target attribute on link tags.
 */
@Override
public void head(Node node, int depth) {
    final boolean isLink = LINK_TAG.equals(node.nodeName());
    if (cssRegistry != null) {
        final String registryStyle = cleanAttribute(cssRegistry.getStyle(node));
        if (StringUtils.hasText(registryStyle)) {
            final String currentStyle = cleanAttribute(node.attr(STYLE_ATTRIBUTE));
            final StringBuilder merged = new StringBuilder();
            // Keep whatever is already on the node and append the registry style after it
            if (StringUtils.hasText(currentStyle)) {
                merged.append(currentStyle).append(STYLE_ATTRIBUTE_SEPARATOR);
            }
            merged.append(registryStyle);
            node.attr(STYLE_ATTRIBUTE, merged.toString());
        }
    }
    if (isLink) {
        node.attr(LINK_TARGET_ATTRIBUTE, LINK_TARGET_ATTRIBUTE_BLANK_VALUE);
    }
}Example 50
| Project: structr-master File: Importer.java View source code |
/**
 * Parses the page source into {@code parsedDocument}.
 *
 * When {@code code} is blank the source is first downloaded from {@code address} and parsed as a
 * full HTML document. Otherwise the existing {@code code} is parsed: as a body fragment when
 * {@code fragment} is set, as a whole document when not; in deployment mode the XML parser is
 * used after void tags have been rewritten to self-closing form.
 *
 * @param fragment whether to treat the code as a page fragment
 * @return always {@code true}
 * @throws FrameworkException
 */
public boolean parse(final boolean fragment) throws FrameworkException {
    init();

    if (!StringUtils.isNotBlank(code)) {
        // No code available yet: fetch it from the configured address
        if (!isDeployment) {
            logger.info("##### Start fetching {} for page {} #####", new Object[] { address, name });
        }
        code = HttpHelper.get(address);
        parsedDocument = Jsoup.parse(code);
        return true;
    }

    if (isDeployment) {
        // add a trailing slash to all void/self-closing tags so the XML parser can parse it correctly
        code = code.replaceAll("<(area|base|br|col|command|embed|hr|img|input|keygen|link|meta|param|source|track|wbr)([^>]*)>", "<$1$2/>");
    } else {
        logger.info("##### Start parsing code for page {} #####", new Object[] { name });
    }

    if (!fragment) {
        if (isDeployment) {
            parsedDocument = Jsoup.parse(code, "", Parser.xmlParser());
        } else {
            parsedDocument = Jsoup.parse(code);
        }
        return true;
    }

    if (!isDeployment) {
        parsedDocument = Jsoup.parseBodyFragment(code);
        return true;
    }

    // Deployment fragment: parse as XML and move the resulting nodes into an empty shell's body
    final List<Node> parsedNodes = Parser.parseXmlFragment(code, "");
    parsedDocument = Document.createShell("");
    final Element shellBody = parsedDocument.body();
    final Node[] detached = parsedNodes.toArray(new Node[parsedNodes.size()]);
    for (int i = detached.length - 1; i > 0; i--) {
        detached[i].remove();
    }
    for (Node each : detached) {
        shellBody.appendChild(each);
    }
    return true;
}Example 51
| Project: structured-content-tools-master File: StripHtmlPreprocessor.java View source code |
/**
 * Visitor callback: appends the trimmed text of each non-blank text node to {@code buffer},
 * followed by a single separating space. Other node types are ignored.
 */
@Override
public void head(Node node, int depth) {
    if (!(node instanceof TextNode)) {
        return;
    }
    // Replace non-breaking spaces (U+00A0) with plain spaces so trim() strips them too.
    // The escape is spelled out because a literal NBSP is indistinguishable from a space
    // in source and is easily lost, turning this into a no-op.
    String text = ((TextNode) node).text().replace('\u00A0', ' ').trim();
    if (text.isEmpty()) {
        return;
    }
    buffer.append(text);
    // A trimmed, non-empty text never ends with a space, so the separator is always needed;
    // the last text gets the extra space too (the original notes it is removed later)
    buffer.append(" ");
}Example 52
| Project: Vaadin-SignatureField-master File: DeclarativeTestBaseBase.java View source code |
/**
 * Serialises an element into predictable HTML: attributes are written in alphabetical order
 * with single-quoted values, text is trimmed, and a closing tag is always emitted.
 *
 * @param producedElem the element to serialise
 * @param sb the builder the markup is appended to
 * @return the complete content of {@code sb} after appending
 */
private String elementToHtml(Element producedElem, StringBuilder sb) {
    final ArrayList<String> attributeNames = new ArrayList<String>();
    for (Attribute attribute : producedElem.attributes().asList()) {
        attributeNames.add(attribute.getKey());
    }
    Collections.sort(attributeNames);

    // Opening tag
    sb.append("<").append(producedElem.tagName());
    for (String attributeName : attributeNames) {
        sb.append(" ");
        sb.append(attributeName);
        sb.append("=");
        sb.append("\'");
        sb.append(producedElem.attr(attributeName));
        sb.append("\'");
    }
    sb.append(">");

    // Content: nested elements recursively, text trimmed, everything else dropped
    for (Node child : producedElem.childNodes()) {
        if (child instanceof Element) {
            elementToHtml((Element) child, sb);
            continue;
        }
        if (child instanceof TextNode) {
            sb.append(((TextNode) child).text().trim());
        }
    }

    // Closing tag
    sb.append("</").append(producedElem.tagName()).append(">");
    return sb.toString();
}Example 53
| Project: symphony-master File: Markdowns.java View source code |
/**
 * Visitor callback for text nodes outside {@code <code>} elements: turns {@code @user } mentions
 * into member links and otherwise applies Pangu spacing to the text.
 *
 * When at least one mention link was produced, the text is re-parsed into nodes that are
 * inserted at the text node's position, and the text node is queued in {@code toRemove}.
 */
@Override
public void head(final org.jsoup.nodes.Node node, int depth) {
    if (!(node instanceof org.jsoup.nodes.TextNode)) {
        return;
    }
    final org.jsoup.nodes.TextNode textNode = (org.jsoup.nodes.TextNode) node;
    final org.jsoup.nodes.Node parent = textNode.parent();
    if (!(parent instanceof org.jsoup.nodes.Element)) {
        return;
    }
    final Element parentElem = (Element) parent;
    if (parentElem.tagName().equals("code")) {
        // Text inside code is left untouched
        return;
    }

    String text = textNode.getWholeText();
    if (null != userQueryService) {
        try {
            final Set<String> mentioned = userQueryService.getUserNames(text);
            for (final String userName : mentioned) {
                final String mention = '@' + userName + " ";
                final String link = "@<a href='" + Latkes.getServePath() + "/member/" + userName + "'>" + userName + "</a> ";
                text = text.replace(mention, link);
            }
            text = text.replace("@participants ", "@<a href='https://hacpai.com/article/1458053458339' class='ft-red'>participants</a> ");
        } finally {
            JdbcRepository.dispose();
        }
    }

    if (!text.contains("@<a href=")) {
        textNode.text(Pangu.spacingText(text));
        return;
    }
    // The text now carries markup: parse it and splice the nodes in where the text node sits
    final List<org.jsoup.nodes.Node> replacement = Parser.parseFragment(text, parentElem, "");
    final int position = textNode.siblingIndex();
    parentElem.insertChildren(position, replacement);
    toRemove.add(node);
}
Example 54
| Project: aMatch-master File: QuestionSearch.java View source code |
/**
 * Downloads the page addressed by {@code urlParser} and scrapes it into questions.
 *
 * Questions are created from the {@code span[class=entry] > a} links; ids, companies, dates,
 * locations and tags are then attached by position, i.e. the i-th match of each selector is
 * applied to the i-th question. Matches beyond the number of questions are ignored instead of
 * failing with an {@code IndexOutOfBoundsException}.
 *
 * @return the scraped questions, in page order
 * @throws IOException if the page cannot be fetched
 */
public Question[] loadRecentQuestions() throws IOException {
    List<Question> questionsList = new ArrayList<Question>();
    String url = urlParser.ParseUrl();
    // fetch the specified URL and parse to a HTML DOM
    Document doc = Jsoup.connect(url).userAgent(userAgent).timeout(timeout).get();

    // POPULATE QUESTION TEXT
    for (Element element : doc.select("span[class=entry] > a")) {
        String plainText = getPlainText(element);
        // Everything after the last newline is dropped; without a newline the text is empty
        int lastNewLineIndex = plainText.lastIndexOf('\n');
        String questionText = lastNewLineIndex == -1 ? "" : plainText.substring(0, lastNewLineIndex);
        Question nextQuestion = new Question(questionText, urlParser.getParsedPageNumber());
        nextQuestion.questionTextLineCount = questionText.split("\n").length + 1;
        questionsList.add(nextQuestion);
    }
    final int questionCount = questionsList.size();

    // POPULATE ID
    Elements elements = doc.select("span[class=entry] a[href~=/question\\?id]");
    for (int index = 0, limit = Math.min(elements.size(), questionCount); index < limit; index++) {
        questionsList.get(index).id = elements.get(index).attr("href");
    }

    // POPULATE COMPANY AND COMPANY URL (both come from the same image element)
    elements = doc.select("span[class=company] img");
    for (int index = 0, limit = Math.min(elements.size(), questionCount); index < limit; index++) {
        Element image = elements.get(index);
        Question nextQuestion = questionsList.get(index);
        nextQuestion.company = image.attr("title");
        nextQuestion.companyImgURL = image.attr("src");
    }

    // POPULATE DATES AND LOCATIONS
    elements = doc.select("abbr[class=timeago]");
    for (int index = 0, limit = Math.min(elements.size(), questionCount); index < limit; index++) {
        Element element = elements.get(index);
        Question nextQuestion = questionsList.get(index);
        nextQuestion.dateText = element.text();
        // The location is whatever node follows the timestamp; it may be missing
        Node following = element.nextSibling();
        if (following != null) {
            nextQuestion.location = following.toString();
        }
    }

    // POPULATE TAGS
    elements = doc.select("span[class=tags]");
    for (int index = 0, limit = Math.min(elements.size(), questionCount); index < limit; index++) {
        List<String> tagsList = new ArrayList<String>();
        for (Node child : elements.get(index).childNodes()) {
            // Take the text between the first '>' and the last '<' of the child's markup
            String tagsRaw = child.toString();
            int parseStart = tagsRaw.indexOf(">");
            int parseEnd = tagsRaw.lastIndexOf("<");
            // parseEnd > parseStart also rules out parseEnd == -1 and inverted ranges
            if (parseEnd > parseStart) {
                tagsList.add(tagsRaw.substring(parseStart + 1, parseEnd));
            }
        }
        questionsList.get(index).tags = tagsList.toArray(new String[tagsList.size()]);
    }
    return questionsList.toArray(new Question[questionsList.size()]);
}Example 55
| Project: Android_RssReader-master File: Readability.java View source code |
// private static String GetArticleTitle(Element htmlNode)
// {
// if (htmlNode.getElementsByTag("title") == null)
// return null;
//
// Element titleNode = htmlNode.getElementsByTag("title").get(0);
//
// String currTitle, origTitle;
// currTitle = origTitle = GetInnerText(titleNode);
//
// if (Regex.IsMatch(currTitle, @" [\|\-] "))
// {
// currTitle = Regex.Replace(origTitle, @"(.*)[\|\-] .*", "$1");
//
// if (currTitle.Split(' ').Length < 3)
// {
// currTitle = origTitle.Replace(@"[^\|\-]*[\|\-](.*)", "$1");
// }
// }
// else if (currTitle.IndexOf(": ") != -1)
// {
// currTitle = Regex.Replace(origTitle, @".*:(.*)", "$1");
//
// if(currTitle.Split(' ').Length < 3)
// {
// currTitle = Regex.Replace(origTitle, @"[^:]*[:](.*)", "$1");
// }
// }
// else if (currTitle.Length > 150 || currTitle.Length < 15)
// {
// var hOnes = htmlNode.GetElementsByTagName("h1");
// if (hOnes.Count == 1)
// {
// currTitle = GetInnerText(hOnes[0]);
// }
// }
//
// if (currTitle.Split(' ').Length <= 4)
// {
// currTitle = origTitle;
// }
//
// return currTitle.Trim();
// }
/**
 * Picks the element that most likely holds the article text and returns its inner HTML.
 *
 * Pass 1 walks all elements below the body: unlikely candidates (judged by class + id) are
 * removed, paragraph-like elements are queued for scoring, and divs are either turned into
 * paragraphs or have their direct text nodes wrapped in paragraphs. Pass 2 credits each queued
 * element's parent and grandparent with a content score. Pass 3 scales the scores by link
 * density and keeps the best candidate.
 *
 * @param doc the parsed page; it is modified in place
 * @return the inner HTML of the best candidate, or {@code null} when nothing was scored
 */
private static String GetArticleContent(Document doc) {
    Element body = doc.body();
    List<Element> allElements = body.getAllElements();
    List<Element> nodesToScore = new ArrayList<Element>();
    for (int nodeIndex = 0, len = allElements.size(); nodeIndex < len; nodeIndex++) {
        Element node = allElements.get(nodeIndex);
        // Match against class AND id. The former "hasAttr ? class : "" + id" parsed as
        // "hasAttr ? class : ("" + id)" and so ignored the id whenever a class was present.
        String unlikelyMatchString = node.attr("class") + node.attr("id");
        if (s_unlikelyCandidates.matcher(unlikelyMatchString).find() && !s_okMaybeItsACandidate.matcher(unlikelyMatchString).find() && !node.nodeName().equals("body") && !node.nodeName().equals("html") && !node.nodeName().equals("head")) {
            node.remove();
            continue;
        }
        if (node.nodeName().equals("p") || node.nodeName().equals("td") || node.nodeName().equals("pre")) {
            nodesToScore.add(node);
        }
        if (node.nodeName().equals("div")) {
            if (!s_divToPElements.matcher(node.html()).find()) {
                // No block-level content inside: treat the div itself as a paragraph
                if (node.ownerDocument() != null) {
                    Element newNode = node.ownerDocument().createElement("p");
                    newNode.html(node.html());
                    node.replaceWith(newNode);
                    nodesToScore.add(newNode);
                }
            } else {
                // Wrap loose text directly inside the div in paragraphs
                for (Node childNode : node.childNodes()) {
                    if (childNode instanceof TextNode) {
                        if (node.ownerDocument() != null) {
                            Element p = node.ownerDocument().createElement("p");
                            p.html(((TextNode) childNode).text());
                            childNode.replaceWith(p);
                        }
                    }
                }
            }
        }
    }
    Map<Element, Integer> scores = new HashMap<Element, Integer>();
    List<Element> candidates = new ArrayList<Element>();
    for (int pt = 0, len = nodesToScore.size(); pt < len; pt++) {
        Element parentNode = nodesToScore.get(pt).parent();
        Element grandParentNode = parentNode != null ? parentNode.parent() : null;
        String innerText = GetInnerText(nodesToScore.get(pt));
        if (parentNode == null)
            continue;
        if (parentNode.nodeName().equals("body"))
            continue;
        if (parentNode.nodeName().equals("html"))
            continue;
        if (parentNode.nodeName().equals("footer"))
            continue;
        if (parentNode.hasAttr("class") && parentNode.attr("class").equals("copyright"))
            continue;
        // Too short to be meaningful content
        if (innerText.length() < 25)
            continue;
        if (!scores.containsKey(parentNode)) {
            scores.put(parentNode, CalculateNodeScore(parentNode));
            candidates.add(parentNode);
        }
        if (grandParentNode != null && !scores.containsKey(grandParentNode)) {
            scores.put(grandParentNode, CalculateNodeScore(grandParentNode));
            candidates.add(grandParentNode);
        }
        // One base point, plus points for comma-separated segments and for length (capped at 3)
        int contentScore = 0;
        contentScore++;
        //for embed flash case
        if (innerText.contains("embed") && (innerText.contains("youku") || innerText.contains("tudou") || innerText.contains("ku6") || innerText.contains("sohu") || innerText.contains("weiphone") || innerText.contains("56") || innerText.contains("youtube") || innerText.contains("qq")))
            contentScore += 50;
        contentScore += innerText.split("[,]|[,]").length;
        contentScore += Math.min(innerText.length() / 100, 3);
        // The parent gets the full score, the grandparent half of it
        int v = scores.get(parentNode);
        v += contentScore;
        scores.put(parentNode, v);
        if (grandParentNode != null) {
            v = scores.get(grandParentNode);
            v += contentScore / 2;
            scores.put(grandParentNode, v);
        }
    }
    Element topCandidate = null;
    for (Element cand : candidates) {
        // Link-heavy candidates are scaled down
        int v = scores.get(cand);
        v = (int) (v * (1 - GetLinkDensity(cand)));
        scores.put(cand, v);
        if (topCandidate == null || scores.get(cand) > scores.get(topCandidate)) {
            topCandidate = cand;
        }
        // NOTE(review): this body fallback runs inside the loop, so it never triggers when
        // there are no candidates at all - confirm whether it was meant to follow the loop
        if (topCandidate == null || topCandidate.nodeName().equals("body")) {
            topCandidate = doc.createElement("div");
            topCandidate.html(body.html());
            body.html("");
            body.appendChild(topCandidate);
            scores.put(topCandidate, CalculateNodeScore(topCandidate));
        }
    }
    return topCandidate == null ? null : topCandidate.html();
}Example 56
| Project: brightspot-cms-master File: RichTextViewBuilder.java View source code |
// Traverses the siblings all the way down the tree, collapsing balanced // blocks of HTML that do NOT contain any rich text elements into a single // HTML string. private List<RichTextViewNode<V>> toViewNodes(List<Node> siblings) { List<RichTextViewNode<V>> viewNodes = new ArrayList<>(); for (Node sibling : siblings) { if (sibling instanceof Element) { Element element = (Element) sibling; RichTextElement rte = RichTextElement.fromElement(element); ObjectType tagType = rte != null ? rte.getState().getType() : null; if (rte != null && elementToView != null) { viewNodes.add(new ElementRichTextViewNode<>(rte, elementToView)); } else if (tagType == null || keepUnboundElements) { List<RichTextViewNode<V>> childViewNodes = toViewNodes(element.childNodes()); String html = element.outerHtml(); if (element.tag().isSelfClosing()) { viewNodes.add(new StringRichTextViewNode<>(html, htmlToView)); } else { int firstGtAt = html.indexOf('>'); int lastLtAt = html.lastIndexOf('<'); // This deliberately does not validate the index values // above, since non-self-closing element should always // have those characters present in the HTML. viewNodes.add(new StringRichTextViewNode<>(html.substring(0, firstGtAt + 1), htmlToView)); viewNodes.addAll(childViewNodes); viewNodes.add(new StringRichTextViewNode<>(html.substring(lastLtAt), htmlToView)); } } } else if (sibling instanceof TextNode) { viewNodes.add(new StringRichTextViewNode<>(((TextNode) sibling).text(), htmlToView)); } else if (sibling instanceof DataNode) { viewNodes.add(new StringRichTextViewNode<>(((DataNode) sibling).getWholeData(), htmlToView)); } } // Collapse the nodes as much as possible. 
List<RichTextViewNode<V>> collapsed = new ArrayList<>(); List<StringRichTextViewNode<V>> adjacent = new ArrayList<>(); for (RichTextViewNode<V> childBuilderNode : viewNodes) { if (childBuilderNode instanceof StringRichTextViewNode) { adjacent.add((StringRichTextViewNode<V>) childBuilderNode); } else { collapsed.add(new StringRichTextViewNode<>(adjacent.stream().map(StringRichTextViewNode::getHtml).collect(Collectors.joining()), htmlToView)); adjacent.clear(); collapsed.add(childBuilderNode); } } if (!adjacent.isEmpty()) { collapsed.add(new StringRichTextViewNode<>(adjacent.stream().map(StringRichTextViewNode::getHtml).collect(Collectors.joining()), htmlToView)); } return collapsed; }
Example 57
| Project: dogeared-extruder-master File: Readability.java View source code |
/**
 * Replaces {@code e} in its tree with a new element named {@code newTag} that takes over all of
 * {@code e}'s child nodes.
 *
 * @return the newly created element
 */
private Element changeElementTag(Element e, String newTag) {
    final Element replacement = document.createElement(newTag);
    /* The child list handed out by JSoup is live, so iterate over a snapshot while moving nodes. */
    final List<Node> snapshot = new ArrayList<Node>();
    snapshot.addAll(e.childNodes());
    for (Node child : snapshot) {
        child.remove();
        replacement.appendChild(child);
    }
    e.replaceWith(replacement);
    return replacement;
}Example 58
| Project: ez-vcard-master File: HCardElementTest.java View source code |
/**
 * Appending text containing \r, \n and \r\n must yield text nodes separated by one
 * {@code <br>} per line break.
 */
@Test
public void append_with_newlines() {
    HCardElement card = build("<div />");
    card.append("Append\rthis\n\ntext\r\nplease.");

    Iterator<Node> children = card.getElement().childNodes().iterator();
    assertTextNodeValue(children.next(), "Append");
    // single \r
    assertTagName(children.next(), "br");
    assertTextNodeValue(children.next(), "this");
    // two consecutive \n
    assertTagName(children.next(), "br");
    assertTagName(children.next(), "br");
    assertTextNodeValue(children.next(), "text");
    // \r\n counts as one break
    assertTagName(children.next(), "br");
    assertTextNodeValue(children.next(), "please.");
    assertFalse(children.hasNext());
}Example 59
| Project: Java-readability-master File: Readability.java View source code |
/**
 * Swaps {@code e} for a freshly created element with tag {@code newTag}, moving every child
 * node of {@code e} into the new element first.
 *
 * @return the element that now stands in for {@code e}
 */
private Element changeElementTag(Element e, String newTag) {
    final Element renamed = document.createElement(newTag);
    /* JSoup returns the live child list; work on a copy because the loop detaches nodes. */
    final List<Node> children = new ArrayList<Node>();
    children.addAll(e.childNodes());
    for (Node moved : children) {
        moved.remove();
        renamed.appendChild(moved);
    }
    e.replaceWith(renamed);
    return renamed;
}Example 60
| Project: jooby-master File: Doc.java View source code |
/**
 * Visitor callback: emits the Markdown that opens the given node into {@code builder}.
 *
 * Text nodes are appended with non-breaking spaces turned into plain spaces; elements are mapped
 * to their Markdown opening marker. Nothing is emitted while inside the table of contents.
 *
 * @param node the node being entered
 * @param depth depth of the node in the tree (unused)
 */
@Override
public void head(final Node node, final int depth) {
    if (isInToc) {
        return;
    }
    if (node instanceof TextNode) {
        TextNode textNode = (TextNode) node;
        // non-break spaces: written as an escape, a literal U+00A0 is easily lost in source
        String txt = textNode.text().replace('\u00A0', ' ');
        builder.append(txt);
    } else if (node instanceof Element) {
        Element element = (Element) node;
        switch (element.tagName()) {
        case "span":
        case "blockquote":
            // ignored
            break;
        case "ol":
        case "ul":
            listDepth += 1;
            // falls through: presumably intentional, a list starts on a new line - TODO confirm
        case "br":
        case "p":
            builder.append("\n");
            break;
        case "div":
            builder.append("\n");
            break;
        case "h1":
            builder.append("\n# ");
            break;
        case "h2":
            builder.append("\n## ");
            break;
        case "h3":
            builder.append("\n### ");
            break;
        case "h4":
            builder.append("\n#### ");
            // this break was missing, so every h4 also received the bold marker below
            break;
        case "b":
        case "strong":
            builder.append("**");
            break;
        case "cite":
        case "i":
        case "u":
            builder.append("*");
            break;
        case "a":
            builder.append('[');
            break;
        case "li":
            for (int i = 0; i < listDepth - 1; i++) {
                builder.append(" ");
            }
            builder.append(element.parent().tagName().equals("ol") ? "1. " : "* ");
            break;
        case "code":
            builder.append("`");
            break;
        case "strike":
            builder.append("<").append(element.tagName()).append(">");
            break;
        case "img":
            String src = element.attr("src");
            String alt = element.attr("alt");
            alt = alt == null ? "" : alt;
            if (src != null) {
                // Markdown image: ![alt](src)
                builder.append("![").append(alt).append("](").append(src).append(")\n");
            }
            break;
        case "pre":
            builder.append("```\n");
            break;
        case "hr":
            builder.append("\n***\n");
            break;
        case "font":
            String face = element.attr("face");
            if (face != null && face.contains("monospace")) {
                builder.append("`");
            }
            break;
        default:
            log.debug("Unhandled element {}", element.tagName());
        }
    }
}Example 61
| Project: LastCalc-master File: MainPageServlet.java View source code |
/**
 * Serves the main page. A request for "/" creates a new worksheet and redirects to it, a request
 * whose path is an 8-character id duplicates the matching read-only worksheet and redirects to the
 * copy, and any other path is treated as a worksheet id and rendered as an HTML document.
 *
 * @param req the incoming request; its URL path selects the action
 * @param resp the response used for redirects, error codes, or the rendered page
 * @throws javax.servlet.ServletException declared by the servlet contract
 * @throws java.io.IOException if redirecting, sending an error, or writing the page fails
 */
@Override
protected void doGet(final javax.servlet.http.HttpServletRequest req, final javax.servlet.http.HttpServletResponse resp) throws javax.servlet.ServletException, java.io.IOException {
    final boolean skipUACheck = req.getParameterMap().containsKey("skipuacheck");
    // getHeader returns null when the client sent no User-Agent header, so guard before matching.
    final String userAgent = req.getHeader("User-Agent");
    if (!skipUACheck && userAgent != null && userAgent.contains("MSIE")) {
        resp.sendRedirect("/noie.html");
        return;
    }
    final URL requestURL = new URL(req.getRequestURL().toString());
    final String path = requestURL.getPath();
    final Objectify obj = DAO.begin();
    if (path.equals("/favicon.ico")) {
        resp.sendError(404);
        return;
    }
    if (path.equals("/")) {
        // Create a new worksheet and redirect to it
        final Worksheet worksheet = new Worksheet();
        obj.save().entity(worksheet).now();
        resp.sendRedirect("/" + worksheet.id + (skipUACheck ? "?skipuacheck=1" : ""));
    } else {
        final String worksheetId = path.substring(1);
        if (worksheetId.length() == 8) {
            // This is readonly, duplicate it and redirect to
            // a new id
            final Worksheet worksheet = new Worksheet();
            final Worksheet template = obj.load().type(Worksheet.class).filter("readOnlyId", worksheetId).first().get();
            if (template == null) {
                resp.sendError(404);
                return;
            }
            // NOTE(review): parentId is set to the new worksheet's own id rather than anything
            // taken from the template - confirm this is intended.
            worksheet.parentId = worksheet.id;
            worksheet.qaPairs = template.qaPairs;
            // NOTE(review): unlike the "/" branch, this save is not followed by now() - confirm.
            obj.save().entity(worksheet);
            resp.sendRedirect("/" + worksheet.id);
        } else {
            final Worksheet worksheet;
            try {
                worksheet = obj.load().type(Worksheet.class).id(worksheetId).get();
            } catch (final NotFoundException e) {
                resp.sendError(404, "Worksheet not found");
                return;
            }
            final Document doc = createDocument(requestURL, worksheet);
            // doc.body().appendElement("iframe").attr("id",
            // "helpframe").attr("src", "/help")
            // .attr("frameBorder", "0");
            final Element helpDiv = doc.body().appendElement("div").attr("id", "helpframe").attr("style", "display: none;");
            // Clone each help node so the shared help document is not modified.
            for (final Node n : Help.getHelpDoc().body().childNodes()) {
                helpDiv.appendChild(n.clone());
            }
            int lineNo = 1;
            final SequentialParser sp = SequentialParser.create();
            Element worksheetElement = doc.body().select("#worksheet").first();
            // One "line" div per stored question/answer pair.
            for (final Line qa : worksheet.qaPairs) {
                sp.processNextAnswer(qa.answer);
                final Element lineEl = worksheetElement.appendElement("div").addClass("line").attr("id", "line" + lineNo);
                if (lineNo == 1) {
                    lineEl.addClass("firstLine");
                }
                final Element lineNumber = lineEl.appendElement("div").attr("class", "lineNumberMarker");
                lineNumber.text(lineNo + ".");
                final Element question = lineEl.appendElement("div").attr("class", "question").attr("contentEditable", "true");
                question.text(qa.question);
                final TokenList strippedAnswer = sp.stripUDF(qa.answer);
                final AnswerType aType = WorksheetServlet.getAnswerType(strippedAnswer);
                if (aType.equals(AnswerType.NORMAL)) {
                    lineEl.appendElement("div").attr("class", "equals").text("=");
                    lineEl.appendElement("div").attr("class", "answer").html(Renderers.toHtml("/", strippedAnswer).toString());
                } else {
                    lineEl.appendElement("div").attr("class", "equals").html("<span style=\"font-size:10pt;\">✓</span>");
                    lineEl.appendElement("div").attr("class", "answer");
                }
                lineNo++;
            }
            doc.body().attr("data-variables", Misc.gson.toJson(sp.getUserDefinedKeywordMap()));
            // Trailing empty line with hidden "equals" and "answer" cells.
            final Element lineEl = worksheetElement.appendElement("div").addClass("line").attr("id", "line" + lineNo);
            if (lineNo == 1) {
                lineEl.addClass("firstLine");
            }
            final Element lineNumber = lineEl.appendElement("div").attr("class", "lineNumberMarker");
            lineNumber.text(lineNo + ".");
            lineEl.appendElement("div").attr("class", "question").attr("contentEditable", "true");
            lineEl.appendElement("div").attr("class", "equals").text("=").attr("style", "display:none;");
            lineEl.appendElement("div").attr("class", "answer").attr("style", "display:none;");
            resp.setContentType("text/html; charset=UTF-8");
            resp.getWriter().append(doc.toString());
        }
    }
}Example 62
| Project: Lightning-Browser-master File: OutputFormatter.java View source code |
/**
 * Recursively appends the text of {@code e}'s children to {@code accum}, skipping every child for
 * which {@code unlikely(child)} is true.
 *
 * @param e the element whose child nodes are walked
 * @param accum the buffer receiving the text
 * @param indent the current recursion depth; only passed on, incremented, to nested calls
 */
private void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
    for (Node node : e.childNodes()) {
        if (!unlikely(node)) {
            if (node instanceof TextNode) {
                accum.append(((TextNode) node).text());
            } else if (node instanceof Element) {
                Element nested = (Element) node;
                // A single space separates a block element from preceding text, and stands in for <br>.
                boolean needsBlockSeparator = accum.length() > 0 && nested.isBlock() && !lastCharIsWhitespace(accum);
                if (needsBlockSeparator || nested.tagName().equals("br")) {
                    accum.append(' ');
                }
                appendTextSkipHidden(nested, accum, indent + 1);
            }
        }
    }
}Example 63
| Project: NiceText-master File: NTHelper.java View source code |
/**
 * Visitor callback: records an element in {@code flatDOM} when its own text is at least
 * {@code WORDS_T} characters long and the last recorded parent text size is zero.
 *
 * @param node the node being entered; anything that is not an {@code Element} is ignored
 * @param depth the depth of the node in the traversal (unused)
 */
@Override
public void head(Node node, int depth) {
    if (!(node instanceof Element)) {
        return;
    }
    Element current = (Element) node;
    Element parent = current.parent();
    // parentTextSize keeps its previous value when the element has no parent.
    if (parent != null) {
        parentTextSize = parent.ownText().length();
    }
    //if ((innerElement.isBlock() || POSSIBLE_TEXT_NODES.matcher(innerElement.tagName()).matches())&& innerElement.text().length()>50) {
    boolean hasEnoughOwnText = current.ownText().length() >= WORDS_T;
    if (hasEnoughOwnText && parentTextSize == 0) {
        flatDOM.add(current);
    }
}Example 64
| Project: OpenLegislation-master File: BillTextTest.java View source code |
/**
 * Walks the children of {@code ele} and appends their text to {@code stringBuilder}, one entry per
 * line. The text of a {@code u} element is appended upper-cased without descending into it; other
 * elements are processed recursively.
 *
 * @param ele the element whose children are processed
 * @param stringBuilder the buffer receiving the extracted text
 */
void processNode(Element ele, StringBuilder stringBuilder) {
    for (Node child : ele.childNodes()) {
        if (child instanceof TextNode) {
            stringBuilder.append(((TextNode) child).text()).append("\n");
            continue;
        }
        if (!(child instanceof Element)) {
            continue;
        }
        Element nested = (Element) child;
        boolean underlined = nested.tag().getName().equals("u");
        if (underlined) {
            stringBuilder.append(nested.text().toUpperCase()).append("\n");
        } else {
            processNode(nested, stringBuilder);
        }
    }
}Example 65
| Project: shopb2b-master File: Article.java View source code |
/**
 * Splits the article content into pages.
 * <p>
 * Empty content yields a single empty page. Content containing {@code contentBreake} is split on
 * it. Otherwise the content is parsed as HTML and the body's child nodes are accumulated into a
 * page until the running text length reaches {@code MAX_PAGE_CONTENT_COUNT}.
 *
 * @return the page contents, in order
 */
@Transient
public String[] getPageContents() {
    if (StringUtils.isEmpty(this.content)) {
        return new String[] { "" };
    }
    if (this.content.contains(contentBreake)) {
        return this.content.split(contentBreake);
    }
    ArrayList<String> pages = new ArrayList<String>();
    List<Node> bodyNodes = Jsoup.parse(this.content).body().childNodes();
    if (bodyNodes != null) {
        int pageLength = 0;
        StringBuffer page = new StringBuffer();
        for (Node node : bodyNodes) {
            if (node instanceof org.jsoup.nodes.Element) {
                // Elements are kept whole: their markup goes to the page, their text length is counted.
                org.jsoup.nodes.Element element = (org.jsoup.nodes.Element) node;
                page.append(element.outerHtml());
                pageLength += element.text().length();
                if (pageLength >= MAX_PAGE_CONTENT_COUNT) {
                    pages.add(page.toString());
                    pageLength = 0;
                    page.setLength(0);
                }
            } else if (node instanceof TextNode) {
                // Text is cut at matches of 'pattern'; each piece gets the next match re-attached.
                String text = ((TextNode) node).text();
                String[] pieces = pattern.split(text);
                Matcher delimiters = pattern.matcher(text);
                for (String piece : pieces) {
                    String chunk = piece;
                    if (delimiters.find()) {
                        chunk = chunk + delimiters.group();
                    }
                    page.append(chunk);
                    pageLength += chunk.length();
                    if (pageLength >= MAX_PAGE_CONTENT_COUNT) {
                        pages.add(page.toString());
                        pageLength = 0;
                        page.setLength(0);
                    }
                }
            }
        }
        // Whatever is left over forms the last page.
        String remainder = page.toString();
        if (StringUtils.isNotEmpty(remainder)) {
            pages.add(remainder);
        }
    }
    return pages.toArray(new String[pages.size()]);
}Example 66
| Project: act-master File: PatentDocument.java View source code |
/**
 * Visitor callback run when a node is left. Anchors contribute their absolute URL to the current
 * segment; a node whose name is in {@code SEGMENTING_NODES} closes the current segment.
 *
 * @param node the node being left
 * @param i the depth of the node in the traversal (unused)
 */
@Override
public void tail(org.jsoup.nodes.Node node, int i) {
    String nodeName = node.nodeName();
    if (nodeName.equals("a")) {
        // Same as Jsoup's HtmlToPlainText.
        segmentBuilder.append(String.format(" <%s>", node.absUrl("href")));
        return;
    }
    if (!SEGMENTING_NODES.contains(nodeName) || segmentBuilder.length() == 0) {
        return;
    }
    String segmentText = segmentBuilder.toString();
    // Ignore blank lines, as we'll be tagging each line separately.
    boolean blank = SPACE_PATTERN.matcher(segmentText).matches();
    if (!blank) {
        this.textSegments.add(segmentText);
    }
    // TODO: is it better to drop the old one than clear the existing?
    segmentBuilder.setLength(0);
}Example 67
| Project: awesome-blogs-android-master File: DocumentConverter.java View source code |
/**
 * Walks the DOM tree below {@code el} and estimates the buffer size needed to hold the result:
 * the length of every text node plus {@code 4 * depth} for every element.
 *
 * @param el the element whose subtree is measured
 * @param depth the depth assigned to {@code el}'s direct children
 * @return the estimated length
 */
private static int calculateLength(Element el, int depth) {
    int total = 0;
    for (final Node child : el.childNodes()) {
        if (child instanceof TextNode) {
            total += ((TextNode) child).text().length();
        } else if (child instanceof Element) {
            int nestedLength = calculateLength((Element) child, depth + 1);
            total += (4 * depth) + nestedLength;
        }
    }
    return total;
}Example 68
| Project: elasticsearch-river-remote-master File: GetSitemapHtmlClient.java View source code |
/**
 * Visitor callback: appends the trimmed text of every non-empty text node to {@code buffer},
 * followed by a single space.
 *
 * @param node the node being entered; anything that is not a {@code TextNode} is ignored
 * @param depth the depth of the node in the traversal (unused)
 */
@Override
public void head(Node node, int depth) {
    if (!(node instanceof TextNode)) {
        return;
    }
    // Convert non-breaking spaces (U+00A0) to plain spaces so trim() can strip them. The escape
    // is used instead of a raw character because the raw form does not survive re-encoding.
    String text = ((TextNode) node).text().replace('\u00A0', ' ').trim();
    if (!text.isEmpty()) {
        // After trim() a non-empty string never ends with a space, so a separator is always added.
        buffer.append(text).append(" ");
    }
}Example 69
| Project: JAVMovieScraper-master File: AvEntertainmentParsingProfile.java View source code |
/**
 * Scrapes the running time from the title box list.
 * <p>
 * Looks for a list item with exactly three child nodes whose second child starts with the
 * English or Japanese "playing time" label and whose third child is a text node, and takes the
 * first run of digits from that text.
 *
 * @return a {@code Runtime} built from the digits found, or from an empty string if none matched
 */
@Override
public Runtime scrapeRuntime() {
    String runtime = "";
    // Compiled once instead of once per list item.
    final Pattern digits = Pattern.compile("\\d+");
    Elements elements = document.select("div[id=titlebox] ul li");
    for (Element element : elements) {
        if (element.childNodeSize() != 3) {
            continue;
        }
        Node labelNode = element.childNode(1);
        Node valueNode = element.childNode(2);
        // Skip items whose label node has no children instead of failing on childNode(0).
        if (!(valueNode instanceof TextNode) || labelNode.childNodeSize() == 0) {
            continue;
        }
        String label = labelNode.childNode(0).toString();
        // The second literal is the Japanese label for "recording time" (U+53CE U+9332 U+6642 U+9593),
        // written with escapes so it cannot be corrupted by a wrong source encoding.
        if (label.startsWith("Playing time") || label.startsWith("\u53CE\u9332\u6642\u9593")) {
            Matcher matcher = digits.matcher(valueNode.toString());
            if (matcher.find()) {
                runtime = matcher.group();
                break;
            }
        }
    }
    return new Runtime(runtime);
}Example 70
| Project: jHTML2Md-master File: HTML2Md.java View source code |
/**
 * Converts the children of {@code element} into text lines and joins them with newlines.
 * <p>
 * Text nodes are appended to the last line with {@code #} and {@code *} prefixed by {@code /};
 * child elements are delegated to {@code processElement}; other node kinds are ignored. Each line
 * is trimmed, and a second or later consecutive blank line is dropped.
 *
 * @param element the element whose content is converted
 * @return the joined text
 */
private static String getTextContent(Element element) {
    ArrayList<MDLine> lines = new ArrayList<MDLine>();
    for (Node child : element.childNodes()) {
        if (child instanceof TextNode) {
            TextNode textNode = (TextNode) child;
            MDLine line = getLastLine(lines);
            // Blank text is only dropped when it would start an otherwise empty line.
            if (!line.getContent().equals("") || !textNode.isBlank()) {
                line.append(textNode.text().replaceAll("#", "/#").replaceAll("\\*", "/\\*"));
            }
        } else if (child instanceof Element) {
            processElement((Element) child, lines);
        }
        // Any other node kind contributes nothing; the former blank println was debug residue.
    }
    int blankLines = 0;
    StringBuilder result = new StringBuilder();
    for (int i = 0; i < lines.size(); i++) {
        String line = lines.get(i).toString().trim();
        if (line.equals("")) {
            blankLines++;
        } else {
            blankLines = 0;
        }
        // Allow at most one blank line in a row.
        if (blankLines < 2) {
            result.append(line);
            if (i < lines.size() - 1) {
                result.append("\n");
            }
        }
    }
    return result.toString();
}Example 71
| Project: jodtemplate-master File: HtmlStylizer.java View source code |
/**
 * Converts an HTML element and its descendants into a list of output elements.
 * <p>
 * A {@code BR_TAG} element becomes a single break element. Otherwise child elements are converted
 * recursively and text nodes become text elements; the collected elements are then wrapped by
 * {@code createListElements} for {@code LI_TAG}, by {@code createParagraphElement} for
 * {@code P_TAG}, or returned as they are.
 *
 * @param element the HTML element to convert
 * @param arPr passed through to {@code createTextElement}
 * @param apPr passed through to the list and paragraph builders
 * @param slide passed through to {@code createTextElement}
 * @return the converted elements
 * @throws IOException propagated from the helper methods
 */
private List<Element> process(final org.jsoup.nodes.Element element, final Element arPr, final Element apPr, final Slide slide) throws IOException {
    final String tagName = element.tagName();
    if (BR_TAG.equals(tagName)) {
        return Arrays.asList(new Element(PPTXDocument.BR_ELEMENT, getDrawingmlNamespace()));
    }
    final List<org.jsoup.nodes.Element> tags = getAllTags(element);
    final List<Element> converted = new ArrayList<>();
    for (Node child : element.childNodes()) {
        if (child instanceof TextNode) {
            converted.add(createTextElement(tags, arPr, (TextNode) child, slide));
        } else if (child instanceof org.jsoup.nodes.Element) {
            converted.addAll(process((org.jsoup.nodes.Element) child, arPr, apPr, slide));
        }
    }
    if (LI_TAG.equals(tagName)) {
        return createListElements(tags, converted, apPr, element);
    }
    if (P_TAG.equals(tagName)) {
        return Arrays.asList(createParagraphElement(converted, apPr));
    }
    return converted;
}Example 72
| Project: Prophet-master File: QTreeHTMLHandler.java View source code |
/**
 * Builds a 'table' <code>Element</code> from the given data, using {@link Object#toString()} for
 * cell values. <code>null</code> entries in <code>header</code>, in <code>rows</code>, and in the
 * individual row arrays are skipped. A cell that should be interpreted as HTML must be passed as a
 * <code>Node</code> instance; the <code>String</code> form of any other value is set as text and
 * is therefore escaped.
 *
 * @param header the optional header for the table
 * @param rows the rows for the table
 * @return the 'table' <code>Element</code>
 */
public static Element table(Object[] header, Object[]... rows) {
    Element table = new Element(Tag.valueOf("table"), "");
    if (header != null) {
        appendTableCells(table.appendElement("tr"), "th", header);
    }
    if (rows != null) {
        for (Object[] row : rows) {
            if (row != null) {
                appendTableCells(table.appendElement("tr"), "td", row);
            }
        }
    }
    return table;
}

/**
 * Appends one cell of type <code>cellTag</code> to <code>rowEl</code> for every non-null entry of
 * <code>cells</code>.
 */
private static void appendTableCells(Element rowEl, String cellTag, Object[] cells) {
    for (Object cell : cells) {
        if (cell == null) {
            continue;
        }
        Element cellEl = rowEl.appendElement(cellTag);
        if (cell instanceof Node) {
            cellEl.appendChild((Node) cell);
        } else {
            cellEl.text(cell.toString());
        }
    }
}Example 73
| Project: scheduler-legacy-master File: CourseParser.java View source code |
/**
 * Parse the Section Detail information page to retrieve the seating availability, registration
 * restrictions, and prerequisites information.
 * <p>
 * Seating values are stored under keys of the form {@code "seating." + header}. Restriction and
 * prerequisite nodes are currently only logged.
 *
 * @param document the Section Detail page HTML document
 * @param values the retrieved course data set, including the newly added Section Detail values
 */
private void parseCourseDetail(Document document, Map<String, String> values) {
    Elements availabilityHeaders = document.select("caption:containsOwn(Registration Availability) + tbody th.ddheader span");
    Elements availabilityValues = document.select("caption:containsOwn(Registration Availability) + tbody td.dddefault");
    // Pair headers with values only as far as both lists reach.
    int pairCount = Math.min(availabilityHeaders.size(), availabilityValues.size());
    for (int pos = 0; pos < pairCount; pos++) {
        String header = availabilityHeaders.get(pos).text();
        String value = availabilityValues.get(pos).text();
        values.put("seating." + header, value);
    }
    Element restrictionElement = document.select("span:containsOwn(Restriction)").first();
    if (restrictionElement == null) {
        logger.debug("No restriction found");
    } else {
        // Walk the siblings up to the next span, or to the end of the sibling list.
        for (Node node = restrictionElement.nextSibling(); node != null && !(node instanceof Element && ((Element) node).tag().equals(Tag.valueOf("span"))); node = node.nextSibling()) {
            logger.debug("Restriction: {}", node);
            // TODO handle the restrictions list - grouping of restrictions (or restriction list elements) indicated by indentation
        }
    }
    Element prerequisiteElement = document.select("span:containsOwn(Prerequisite)").first();
    if (prerequisiteElement == null) {
        logger.debug("No prequisite found");
    } else {
        for (Node node = prerequisiteElement.nextSibling(); node != null; node = node.nextSibling()) {
            logger.debug("Prereq: {}", node);
            //TODO handle the prerequisite list - can be AND-OR or OR-AND formatted (keywords 'and' 'or' present to indicate w/ parentheses for grouping
        }
    }
}Example 74
| Project: WiFiAfterConnect-master File: HtmlPage.java View source code |
/**
 * Parses an HTML page and collects its meta tags, title, body {@code onLoad} attribute, forms,
 * head and body scripts, form-bound inputs, and any XML block embedded in a comment.
 *
 * @param html the HTML source of the page
 * @return {@code false} if {@code super.parse(html)} fails or no document is produced,
 *         {@code true} otherwise
 */
@Override
public boolean parse(String html) {
    Log.d(Constants.TAG, "Page " + this);
    if (!super.parse(html))
        return false;
    Document doc = Jsoup.parse(html);
    if (doc == null) {
        Log.d(Constants.TAG, "Parsing html: doc == null");
        return false;
    }
    Log.d(Constants.TAG, "Parsing html: doc html == {" + doc.html() + "}");
    // some portals sneak form to outside of <div id="content"> - the bastards!
    Element content = doc;
    // Meta tags with non-empty content, keyed by lower-cased http-equiv or name.
    for (Element meta : content.getElementsByTag("meta")) {
        String c = meta.attr("content");
        if (!c.isEmpty()) {
            if (meta.hasAttr("http-equiv"))
                httpEquivMetas.put(meta.attr("http-equiv").toLowerCase(Locale.ENGLISH), c);
            else if (meta.hasAttr("name"))
                namedMetas.put(meta.attr("name").toLowerCase(Locale.ENGLISH), c);
        }
    }
    // The first title element with non-empty data wins.
    for (Element te : content.getElementsByTag("title")) {
        title = te.data();
        if (!title.isEmpty())
            break;
    }
    for (Element body : content.getElementsByTag("body")) {
        Log.d(Constants.TAG, "Parsing html: body found.");
        if (body.hasAttr("onLoad")) {
            onLoad = body.attr("onLoad");
            break;
        }
    }
    for (Element fe : content.getElementsByTag("form")) {
        HtmlForm f = new HtmlForm(fe);
        forms.add(f);
        Log.d(Constants.TAG, "Parsing html: form added. Forms == " + forms.toString());
        String fid = f.getId();
        if (!fid.isEmpty())
            namedForms.put(fid, f);
    }
    for (Element head : content.getElementsByTag("head")) {
        for (Element jse : head.getElementsByTag("script")) {
            if (isJavaScript(jse)) {
                JavaScript j = new JavaScript(jse);
                headJavaScripts.add(j);
                Log.d(Constants.TAG, "Parsing html: HEAD JS added. javaScripts = " + headJavaScripts.toString());
            }
        }
        if (!headJavaScripts.isEmpty())
            checkJavaScriptForMetaRefresh();
    }
    for (Element body : content.getElementsByTag("body")) {
        for (Element jse : body.getElementsByTag("script")) {
            if (isJavaScript(jse)) {
                JavaScript j = new JavaScript(jse);
                bodyJavaScripts.add(j);
                // This loop handles body scripts; the message used to say HEAD (copy-paste slip).
                Log.d(Constants.TAG, "Parsing html: BODY JS added. javaScripts = " + bodyJavaScripts.toString());
            }
        }
    }
    // Attach inputs that name their form by id to that form.
    for (Element ie : content.getElementsByTag("input")) {
        HtmlInput i = new HtmlInput(ie, false);
        String fid = i.getFormId();
        if (!fid.isEmpty()) {
            HtmlForm f = namedForms.get(fid);
            if (f != null)
                f.addInput(i);
        }
    }
    // Look for an XML block hidden inside a comment anywhere in the document.
    for (Element e : doc.getAllElements()) {
        for (Node n : e.childNodes()) {
            if (n instanceof Comment) {
                String commentData = ((Comment) n).getData();
                if (commentData.startsWith("<?xml")) {
                    WISPAccessGatewayParam wp = WISPAccessGatewayParam.parse(commentData);
                    if (wp != null)
                        wISPr = wp;
                }
            }
        }
    }
    return true;
}Example 75
| Project: Ouroboros-master File: CommentParser.java View source code |
/**
 * Converts one body line into formatted text by concatenating the rendering of each child node.
 * Text nodes and unrecognized elements are rendered as normal text; {@code span}, {@code em},
 * {@code strong}, {@code u}, {@code s} and {@code a} elements go to their dedicated parsers.
 *
 * @param bodyLine the element whose children are rendered
 * @param currentBoard passed through to {@code parseAnchorText}
 * @param resto passed through to {@code parseAnchorText}
 * @param fragmentManager passed through to {@code parseAnchorText}
 * @param infiniteDbHelper passed through to {@code parseAnchorText}
 * @return the concatenated formatted text
 */
private CharSequence parseFormatting(Element bodyLine, String currentBoard, String resto, FragmentManager fragmentManager, InfiniteDbHelper infiniteDbHelper) {
    CharSequence parsedText = "";
    for (Node childNode : bodyLine.childNodes()) {
        final CharSequence piece;
        if (childNode instanceof TextNode) {
            piece = parseNormalText(new SpannableString(((TextNode) childNode).text()));
        } else if (childNode instanceof Element) {
            Element childElement = (Element) childNode;
            switch (childElement.tagName()) {
                case "span":
                    piece = parseSpanText(childElement);
                    break;
                case "em":
                    piece = parseItalicText(new SpannableString(childElement.text()));
                    break;
                case "strong":
                    piece = parseBoldText(new SpannableString(childElement.text()));
                    break;
                case "u":
                    piece = parseUnderlineText(new SpannableString(childElement.text()));
                    break;
                case "s":
                    piece = parseStrikethroughText(new SpannableString(childElement.text()));
                    break;
                case "a":
                    piece = parseAnchorText(childElement, currentBoard, resto, fragmentManager, infiniteDbHelper);
                    break;
                default:
                    piece = parseNormalText(new SpannableString(childElement.text()));
                    break;
            }
        } else {
            // Neither text nor element: contributes nothing.
            continue;
        }
        parsedText = TextUtils.concat(parsedText, piece);
    }
    return parsedText;
}Example 76
| Project: Skype4J-master File: RichText.java View source code |
/**
 * Feeds the content of {@code node} into the rich text starting at {@code root}, recursing into
 * child elements, and returns the {@code RichText} that is current when processing ends.
 * A node that is not an {@code Element} leaves {@code root} untouched and is returned as is.
 *
 * @param root the rich text the node's content is added to
 * @param node the node to process
 * @return the current {@code RichText} after processing (equal to {@code root} for non-elements)
 */
private static RichText parse(RichText root, Node node) {
RichText current = root;
if (node instanceof Element) {
Element elem = (Element) node;
applyTag(current, elem);
String inner = elem.html();
Elements children = elem.children();
if (children.size() > 0) {
// parts[k] is the inner-HTML text preceding child k; the final entry is the text after
// the last child.
String[] parts = new String[children.size() + 1];
int i = 0;
int index = 0;
for (Element child : children) {
// Find the child's opening tag in the serialized inner HTML, searching from the end of
// the previous child.
// NOTE(review): indexOf returns -1 when the tag text is not found, which would make the
// substring call below throw - confirm this cannot happen.
int startChild = inner.indexOf("<" + child.tag().toString(), index);
// assumes child.outerHtml() has the same length as the child's span inside inner - TODO confirm
int endChild = startChild + child.outerHtml().length();
parts[i++] = inner.substring(index, startChild);
index = endChild;
}
parts[i] = inner.substring(index);
Element last = elem;
// Alternate between a text part and the child element that follows it.
for (int j = 0; j < parts.length; j++) {
if (hasTag(root, last)) {
current.appendText(parts[j]);
} else {
// Otherwise continue with whatever append returns, given root's format.
current = current.append(parts[j], true);
current.copyFormat(root);
}
if (j < children.size()) {
Element child = children.get(j);
if (!hasTag(current, child)) {
current = current.append("", true);
current.copyFormat(root);
}
current = parse(current, child);
last = child;
}
}
} else {
// No child elements: the whole inner HTML is appended as text.
current.appendText(inner);
}
}
return current;
}Example 77
| Project: thredds-master File: NcepHtmlScraper.java View source code |
//////////////////////////////////////////////////////////////////
// http://www.nco.ncep.noaa.gov/pmb/docs/grib2/grib2_doc.shtml
/**
 * Downloads the NCEP GRIB2 documentation index and hands every link whose text starts with
 * "Table 4" to the matching table parser. "Table 4.2" goes to {@code parseTable42}, the others to
 * {@code parseCodeTable}. The title passed along is the link's next sibling with "-" removed and
 * trimmed, or {@code null} if the link has no next sibling.
 *
 * @throws IOException if the index page cannot be fetched or a table parser fails
 */
void parseTopDoc() throws IOException {
    String source = "http://www.nco.ncep.noaa.gov/pmb/docs/grib2/grib2_doc.shtml";
    // 5 sec timeout
    Document doc = Jsoup.parse(new URL(source), 5 * 1000);
    for (Element link : doc.select("a[href]")) {
        Node sib = link.nextSibling();
        String title = (sib == null) ? null : StringUtil2.remove(sib.toString(), "-").trim();
        String linkText = link.text();
        if (linkText.equals("Table 4.2")) {
            parseTable42(link.attr("abs:href"), linkText, title);
        } else if (linkText.startsWith("Table 4")) {
            parseCodeTable(link.attr("abs:href"), linkText, title);
        }
    }
}Example 78
| Project: GPXConverter-master File: GarminForm.java View source code |
private static String findFlowKey(Node node) { String key = null; for (int i = 0; i < node.childNodes().size(); ) { Node child = node.childNode(i); if (child.nodeName().equals("#comment")) { //System.out.println(child.toString()); String flowKeyPattern = "\\<\\!-- flowExecutionKey\\: \\[(e1s1)\\] --\\>"; key = child.toString().replaceAll(flowKeyPattern, "$1").trim(); break; } else { findFlowKey(child); i++; } } return key; }
Example 79
| Project: opacclient-master File: SISIS.java View source code |
/**
 * Parses a search result page into a {@link SearchRequestResult}.
 * <p>
 * Besides building one {@code SearchResult} per table row (media type, cover, description HTML
 * and availability status), this stores the {@code identifier} query parameter of the first
 * "singleHit.do" link and the number of parsed rows in the corresponding fields.
 *
 * @param html the HTML of the result page
 * @param page the page number, used to number the results
 * @return the parsed results; an empty result if the page reports "keine Treffer"
 * @throws OpacErrorException if the page contains an ".error" or ".nohits" element
 * @throws SingleResultFound if the header shows a 1/1 result
 */
public SearchRequestResult parse_search(String html, int page) throws OpacErrorException, SingleResultFound {
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url + "/searchfoo");
    if (doc.select(".error").size() > 0) {
        throw new OpacErrorException(doc.select(".error").text().trim());
    } else if (doc.select(".nohits").size() > 0) {
        throw new OpacErrorException(doc.select(".nohits").text().trim());
    } else if (doc.select(".box-header h2, #nohits").text().contains("keine Treffer")) {
        return new SearchRequestResult(new ArrayList<SearchResult>(), 0, 1, 1);
    }
    int results_total = -1;
    String resultnumstr = doc.select(".box-header h2").first().text();
    if (resultnumstr.contains("(1/1)") || resultnumstr.contains(" 1/1")) {
        throw new SingleResultFound();
    } else if (resultnumstr.contains("(")) {
        results_total = Integer.parseInt(resultnumstr.replaceAll(".*\\(([0-9]+)\\).*", "$1"));
    } else if (resultnumstr.contains(": ")) {
        results_total = Integer.parseInt(resultnumstr.replaceAll(".*: ([0-9]+)$", "$1"));
    }
    Elements table = doc.select("table.data tbody tr");
    identifier = null;
    Elements links = doc.select("table.data a");
    boolean haslink = false;
    for (int i = 0; i < links.size(); i++) {
        Element node = links.get(i);
        if (node.hasAttr("href") && node.attr("href").contains("singleHit.do") && !haslink) {
            haslink = true;
            try {
                // Undo HTML entity encoding of '&' before parsing the query string.
                List<NameValuePair> anyurl = URLEncodedUtils.parse(new URI(node.attr("href").replace(" ", "%20").replace("&amp;", "&")), ENCODING);
                for (NameValuePair nv : anyurl) {
                    if (nv.getName().equals("identifier")) {
                        identifier = nv.getValue();
                        break;
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    List<SearchResult> results = new ArrayList<>();
    for (int i = 0; i < table.size(); i++) {
        Element tr = table.get(i);
        SearchResult sr = new SearchResult();
        // Media type: configured mapping by icon file name first, defaults by title/file name otherwise.
        if (tr.select("td img[title]").size() > 0) {
            String title = tr.select("td img").get(0).attr("title");
            String[] fparts = tr.select("td img").get(0).attr("src").split("/");
            String fname = fparts[fparts.length - 1];
            MediaType default_by_fname = defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "").replace(".gif", "").replace(".png", ""));
            MediaType default_by_title = defaulttypes.get(title);
            MediaType default_name = default_by_title != null ? default_by_title : default_by_fname;
            if (data.has("mediatypes")) {
                try {
                    sr.setType(MediaType.valueOf(data.getJSONObject("mediatypes").getString(fname)));
                } catch (JSONException | IllegalArgumentException e) {
                    sr.setType(default_name);
                }
            } else {
                sr.setType(default_name);
            }
        }
        String alltext = tr.text();
        if (alltext.contains("eAudio") || alltext.contains("eMusic")) {
            sr.setType(MediaType.MP3);
        } else if (alltext.contains("eVideo")) {
            sr.setType(MediaType.EVIDEO);
        } else if (alltext.contains("eBook")) {
            sr.setType(MediaType.EBOOK);
        } else if (alltext.contains("Munzinger")) {
            sr.setType(MediaType.EDOC);
        }
        if (tr.children().size() > 3 && tr.child(3).select("img[title*=cover]").size() == 1) {
            sr.setCover(tr.child(3).select("img[title*=cover]").attr("abs:src"));
            if (sr.getCover().contains("showCover.do")) {
                downloadCover(sr);
            }
        }
        // Pick the cell (or a wrapper inside it) whose child nodes carry the title data.
        Element middlething;
        if (tr.children().size() > 2 && tr.child(2).select("a").size() > 0) {
            middlething = tr.child(2);
        } else {
            middlething = tr.child(1);
        }
        List<Node> children = middlething.childNodes();
        if (middlething.select("div").not("#hlrightblock,.bestellfunktionen").size() == 1) {
            Element indiv = middlething.select("div").not("#hlrightblock,.bestellfunktionen").first();
            if (indiv.children().size() > 1) {
                children = indiv.childNodes();
            }
        } else if (middlething.select("span.titleData").size() == 1) {
            children = middlething.select("span.titleData").first().childNodes();
        }
        int childrennum = children.size();
        // Each entry: { outer tag, inner tag, text [, outer class, outer style] }.
        List<String[]> strings = new ArrayList<>();
        for (int ch = 0; ch < childrennum; ch++) {
            Node node = children.get(ch);
            if (node instanceof TextNode) {
                String text = ((TextNode) node).text().trim();
                if (text.length() > 3) {
                    strings.add(new String[] { "text", "", text });
                }
            } else if (node instanceof Element) {
                List<Node> subchildren = node.childNodes();
                for (int j = 0; j < subchildren.size(); j++) {
                    Node subnode = subchildren.get(j);
                    if (subnode instanceof TextNode) {
                        String text = ((TextNode) subnode).text().trim();
                        if (text.length() > 3) {
                            strings.add(new String[] { ((Element) node).tag().getName(), "text", text, ((Element) node).className(), node.attr("style") });
                        }
                    } else if (subnode instanceof Element) {
                        String text = ((Element) subnode).text().trim();
                        if (text.length() > 3) {
                            strings.add(new String[] { ((Element) node).tag().getName(), ((Element) subnode).tag().getName(), text, ((Element) node).className(), node.attr("style") });
                        }
                    }
                }
            }
        }
        StringBuilder description = null;
        if (tr.select("span.Z3988").size() == 1) {
            // Sometimes there is a <span class="Z3988"> item which provides
            // data in a standardized format.
            List<NameValuePair> z3988data;
            boolean hastitle = false;
            try {
                description = new StringBuilder();
                z3988data = URLEncodedUtils.parse(new URI("http://dummy/?" + tr.select("span.Z3988").attr("title")), "UTF-8");
                for (NameValuePair nv : z3988data) {
                    if (nv.getValue() != null) {
                        if (!nv.getValue().trim().equals("")) {
                            if (nv.getName().equals("rft.btitle") && !hastitle) {
                                description.append("<b>").append(nv.getValue()).append("</b>");
                                hastitle = true;
                            } else if (nv.getName().equals("rft.atitle") && !hastitle) {
                                description.append("<b>").append(nv.getValue()).append("</b>");
                                hastitle = true;
                            } else if (nv.getName().equals("rft.au")) {
                                description.append("<br />").append(nv.getValue());
                            } else if (nv.getName().equals("rft.date")) {
                                description.append("<br />").append(nv.getValue());
                            }
                        }
                    }
                }
            } catch (URISyntaxException e) {
                description = null;
            }
        }
        boolean described = false;
        if (description != null && description.length() > 0) {
            sr.setInnerhtml(description.toString());
            described = true;
        } else {
            description = new StringBuilder();
        }
        int k = 0;
        boolean yearfound = false;
        boolean titlefound = false;
        boolean sigfound = false;
        for (String[] part : strings) {
            // Build the description from the collected strings unless Z3988 data already supplied it.
            if (!described) {
                if (part[0].equals("a") && (k == 0 || !titlefound)) {
                    if (k != 0) {
                        description.append("<br />");
                    }
                    description.append("<b>").append(part[2]).append("</b>");
                    titlefound = true;
                } else if (part[2].matches("\\D*[0-9]{4}\\D*") && part[2].length() <= 10) {
                    yearfound = true;
                    if (k != 0) {
                        description.append("<br />");
                    }
                    description.append(part[2]);
                } else if (k == 1 && !yearfound && part[2].matches("^\\s*\\([0-9]{4}\\)$")) {
                    if (k != 0) {
                        description.append("<br />");
                    }
                    description.append(part[2]);
                } else if (k == 1 && !yearfound) {
                    description.append("<br />");
                    description.append(part[2]);
                } else if (k > 1 && k < 4 && !sigfound && part[0].equals("text") && part[2].matches("^[A-Za-z0-9,\\- ]+$")) {
                    description.append("<br />");
                    description.append(part[2]);
                }
            }
            // Status from CSS class / inline style of the enclosing element.
            if (part.length == 4) {
                if (part[0].equals("span") && part[3].equals("textgruen")) {
                    sr.setStatus(SearchResult.Status.GREEN);
                } else if (part[0].equals("span") && part[3].equals("textrot")) {
                    sr.setStatus(SearchResult.Status.RED);
                }
            } else if (part.length == 5) {
                if (part[4].contains("purple")) {
                    sr.setStatus(SearchResult.Status.YELLOW);
                }
            }
            // Otherwise fall back to the German availability phrases in the text.
            if (sr.getStatus() == null) {
                if ((part[2].contains("entliehen") && part[2].startsWith("Vormerkung ist leider nicht möglich")) || part[2].contains("nur in anderer Zweigstelle ausleihbar und nicht bestellbar")) {
                    sr.setStatus(SearchResult.Status.RED);
                } else if (part[2].startsWith("entliehen") || part[2].contains("Ein Exemplar finden Sie in einer anderen Zweigstelle")) {
                    sr.setStatus(SearchResult.Status.YELLOW);
                } else if ((part[2].startsWith("bestellbar") && !part[2].contains("nicht bestellbar")) || (part[2].startsWith("vorbestellbar") && !part[2].contains("nicht vorbestellbar")) || (part[2].startsWith("vormerkbar") && !part[2].contains("nicht vormerkbar")) || (part[2].contains("heute zurückgebucht")) || (part[2].contains("ausleihbar") && !part[2].contains("nicht ausleihbar"))) {
                    sr.setStatus(SearchResult.Status.GREEN);
                }
                if (sr.getType() != null) {
                    if (sr.getType().equals(MediaType.EBOOK) || sr.getType().equals(MediaType.EVIDEO) || sr.getType().equals(MediaType.MP3)) // Especially Onleihe.de ebooks are often marked
                    // green though they are not available.
                    {
                        sr.setStatus(SearchResult.Status.UNKNOWN);
                    }
                }
            }
            k++;
        }
        if (!described) {
            sr.setInnerhtml(description.toString());
        }
        sr.setNr(10 * (page - 1) + i);
        sr.setId(null);
        results.add(sr);
    }
    resultcount = results.size();
    return new SearchRequestResult(results, results_total, page);
}Example 80
| Project: MyTv-master File: TvMaoCrawler.java View source code |
/**
 * Parses the TV program table out of a page.
 * <p>
 * The date and weekday come from the page header and the station name from the side bar. Each
 * {@code ul#pgrow li} item contributes one {@code ProgramTable}, which is reported to all
 * registered listeners and added to the result.
 *
 * @param html the HTML of the program page
 * @return the parsed program entries; the entries collected so far if an item has no air time
 */
private List<ProgramTable> parseProgramTable(String html) {
    Document doc = Jsoup.parse(html);
    Elements dateElements = doc.select("div.pgmain div[class=\"mt10 clear\"] b:first-child");
    String[] dateAndWeekArray = dateElements.get(0).text().trim().split("\\s+");
    String date = Calendar.getInstance().get(Calendar.YEAR) + "-" + dateAndWeekArray[0];
    int week = weekStringToInt(dateAndWeekArray[1]);
    Elements stationElements = doc.select("aside[class=\"related-aside rt\"] section[class=\"aside-section clear\"] div.bar");
    String stationName = stationElements.get(0).text().trim();
    Elements programElements = doc.select("ul#pgrow li");
    List<ProgramTable> resultList = new ArrayList<ProgramTable>();
    for (Element item : programElements) {
        List<Node> nodes = item.childNodes();
        int count = nodes.size();
        if (count < 2) {
            continue;
        }
        // Find the air time: the first SPAN child.
        int idx = 0;
        while (idx < count) {
            Node candidate = nodes.get(idx);
            if (candidate instanceof Element && "SPAN".equalsIgnoreCase(((Element) candidate).tagName())) {
                break;
            }
            idx++;
        }
        if (idx == count) {
            logger.info("the program table of " + stationName + " at " + date + " does not exists.");
            return resultList;
        }
        String airTime = ((Element) nodes.get(idx)).text().trim();
        idx++;
        StringBuilder program = new StringBuilder();
        // Find the program name: text nodes up to and including the first A element.
        while (idx < count) {
            Node current = nodes.get(idx);
            idx++;
            if (current instanceof TextNode) {
                program.append(((TextNode) current).text().trim());
            } else if (current instanceof Element && "A".equalsIgnoreCase(((Element) current).tagName())) {
                program.append(((Element) current).text().trim());
                break;
            }
        }
        if (idx < count - 1) {
            // There is still a text node element left.
            Node trailing = nodes.get(idx);
            if (trailing instanceof TextNode) {
                program.append(((TextNode) trailing).text().trim());
            }
        }
        ProgramTable pt = new ProgramTable();
        pt.setAirDate(date);
        pt.setAirTime(date + " " + airTime);
        pt.setProgram(program.toString().trim());
        pt.setStationName(stationName);
        pt.setWeek(week);
        for (CrawlEventListener listener : listeners) {
            listener.itemFound(new ProgramTableFoundEvent(this, pt));
        }
        resultList.add(pt);
    }
    return resultList;
}Example 81
| Project: Diary.Ru-Client-master File: NetworkService.java View source code |
/**
 * Applies the app's modifications to a loaded diary page, in place.
 * Page tweaks requested by users are collected here.
 *
 * <p>In order, this method: links the theme stylesheet selected by the {@code app.theme}
 * preference (default {@code "red"}); points each repost button at the diary "newpost" link;
 * optionally replaces post-action images with their title text; strips {@code onclick}
 * attributes from elements whose {@code href} is not on the keep-list; optionally swaps
 * images for load-on-tap buttons; and appends the helper JavaScript plus the user's
 * signature to the body.
 *
 * @param resultPage the page to modify
 */
private void mutateContent(Document resultPage) {
    // the page gets our own stylesheet
    // NOTE(review): the title attribute text below looks mis-encoded (probably a Russian word
    // that lost its encoding); it is left untouched here - confirm the source file encoding.
    String theme = mPreferences.getString("app.theme", "red");
    resultPage.head().append("<link rel=\"stylesheet\" href=\"file:///android_asset/css/" + theme + ".css\" type=\"text/css\" media=\"all\" title=\"Стандарт\"/>");

    // the repost button must point to the proper link
    Elements shareLinks = resultPage.select(".postLinks li[class^=quote]");
    for (Element shareLi : shareLinks) {
        // child(0) indexes element children only, whereas childNodeSize() also counts text
        // nodes; test the element children so an <li> holding only text is skipped instead
        // of throwing IndexOutOfBoundsException.
        if (shareLi.children().isEmpty()) {
            continue;
        }
        Element repostLink = shareLi.child(0);
        Element diaryRepost = shareLi.select("div a[href*=newpost]").first();
        if (diaryRepost != null) {
            repostLink.attr("href", diaryRepost.attr("href"));
        }
    }

    // text instead of the edit buttons
    if (mUseTextInsteadOfImages) {
        Elements postActionImages = resultPage.select("ul.postActionLinks img");
        for (Element img : postActionImages) {
            // convert the image into plain text
            if (img.hasAttr("title")) {
                Node text = new TextNode(img.attr("title"), resultPage.baseUri());
                img.replaceWith(text);
            }
        }
    }

    // JS clean-up
    Elements jsElems = resultPage.getElementsByAttribute("onclick");
    for (Element js : jsElems) {
        String link = js.attr("href");
        if (!link.contains("#more") && !link.contains("subscribe") && !link.contains("showresult") && !link.contains("up&signature=") && !link.contains("down&signature=") && !link.contains("tag_showedit")) {
            // kill all javascript except MORE, raising/lowering of posts, voting results and subscription
            js.removeAttr("onclick");
        }
    }

    // swap the images when auto-loading is turned off
    if (!mLoadImages) {
        Elements images = resultPage.select("img[src^=http], a:has(img)");
        for (Element current : images) {
            // NOTE(review): src/href are interpolated unescaped into the generated markup;
            // a URL containing quote characters would break the onclick handler - consider escaping.
            if (current.tagName().equals("img")) {
                String src = current.attr("src");
                if (!src.contains("diary.ru") && !current.parent().className().equals("avatar") && !src.startsWith("/")) {
                    // every image that does not meet the criteria is replaced by a button that loads it when clicked
                    String jsButton = "<input type='image' src='file:///android_asset/images/load_image.png' onclick='return handleIMGDown(this, \"" + src + "\")' />";
                    current.after(jsButton);
                    current.remove();
                }
            } else if (current.tagName().equals("a")) {
                String src = current.getElementsByTag("img").attr("src");
                if (!src.contains("diary.ru") && !current.parent().className().equals("avatar") && !src.startsWith("/")) {
                    // every linked image that does not meet the criteria is replaced by a button that loads it when clicked
                    String jsButton = "<input type='image' src='file:///android_asset/images/load_image.png' onclick='return handleADown(this, \"" + current.attr("href") + "\", \"" + src + "\")' />";
                    current.after(jsButton);
                    current.remove();
                }
            }
        }
    }

    // include the javascript
    resultPage.body().append(Utils.javascriptContent);
    // the signature must be visible to the JS methods
    resultPage.body().append("<script>var signature = '" + UserData.getInstance().getSignature() + "';</script>");
}
Example 82
| Project: hn-android-master File: BaseHTMLParser.java View source code |
/**
 * Returns the text of the first direct child of {@code element} that is a {@link TextNode}.
 *
 * @param element the element whose child nodes are scanned; may be {@code null}
 * @return the first text child's text, or an empty string when {@code element} is
 *         {@code null} or has no text-node child
 */
public static String getFirstTextValueInElementChildren(Element element) {
    String firstText = "";
    if (element != null) {
        for (org.jsoup.nodes.Node candidate : element.childNodes()) {
            if (candidate instanceof TextNode) {
                firstText = ((TextNode) candidate).text();
                break;
            }
        }
    }
    return firstText;
}
Example 83
| Project: JibbrJabbr-master File: HtmlResource.java View source code |
/**
 * No-op callback: neither {@code node} nor {@code depth} is used.
 *
 * @param node  the node passed to the callback (unused)
 * @param depth the depth passed to the callback (unused)
 */
@Override
public void head(Node node, int depth) {
    // nothing to do
}
Example 84
| Project: webmagic-master File: CssSelector.java View source code |
/**
 * Concatenates the text of every direct {@link TextNode} child of {@code element},
 * in child order. Children that are not text nodes contribute nothing.
 *
 * @param element the element whose direct child nodes are read
 * @return the joined text of the direct text-node children; empty when there are none
 */
protected String getText(Element element) {
    final StringBuilder ownText = new StringBuilder();
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        ownText.append(((TextNode) child).text());
    }
    return ownText.toString();
}
Example 85
| Project: karma-exchange-master File: HtmlUtil.java View source code |
/**
 * Hit when the node is first seen. Appends the text of a {@link TextNode}, or a
 * bullet prefix for an {@code li} node; any other node appends nothing.
 *
 * @param node  the node being visited
 * @param depth the node's depth (unused)
 */
public void head(Node node, int depth) {
    final String nodeName = node.nodeName();
    if (node instanceof TextNode) {
        // TextNodes carry all user-readable text in the DOM.
        append(((TextNode) node).text());
        return;
    }
    if (nodeName.equals("li")) {
        append("\n * ");
    }
}
Example 86
| Project: StartupNews-master File: BaseHTMLParser.java View source code |
/**
 * Finds the first {@link TextNode} among the direct children of {@code element}
 * and returns its text.
 *
 * @param element the element to inspect; may be {@code null}
 * @return the text of the first text-node child, or {@code ""} if {@code element}
 *         is {@code null} or none of its children is a text node
 */
public static String getFirstTextValueInElementChildren(Element element) {
    if (element == null) {
        return "";
    }
    for (org.jsoup.nodes.Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        return ((TextNode) child).text();
    }
    return "";
}
Example 87
| Project: jenkinsmobi-api-master File: GoogleSsoHandler.java View source code |
/**
 * Returns the trimmed whole text of the first direct {@link TextNode} child of
 * {@code errorDiv}.
 *
 * @param errorDiv the element whose child nodes are scanned
 * @return the trimmed text of the first text-node child, or {@code ""} when there is none
 */
private String getDivText(final Element errorDiv) {
    String divText = "";
    for (final Node candidate : errorDiv.childNodes()) {
        if (candidate instanceof TextNode) {
            divText = ((TextNode) candidate).getWholeText().trim();
            break;
        }
    }
    return divText;
}
Example 88
| Project: PrepayCredit-master File: HtmlUtilities.java View source code |
/**
 * Recursively removes every child whose node name is {@code "#comment"} from the
 * subtree below {@code node}.
 *
 * @param node the root of the subtree to clean
 */
private static void removeComments(Node node) {
    int index = 0;
    while (index < node.childNodes().size()) {
        final Node current = node.childNode(index);
        if (!current.nodeName().equals("#comment")) {
            removeComments(current);
            index++;
            continue;
        }
        // the index is not advanced after a removal, so the same position is examined again
        current.remove();
    }
}
Example 89
| Project: tika-wrapper-master File: HtmlToPlaintTextSimple.java View source code |
/**
 * Hit when the node is first seen. A {@link TextNode} has its text appended, an
 * {@code li} node appends a bullet prefix, and every other node is ignored.
 *
 * @param node  the node being visited
 * @param depth the node's depth (unused)
 */
public void head(Node node, int depth) {
    final String tag = node.nodeName();
    if (!(node instanceof TextNode)) {
        if (tag.equals("li")) {
            append("\n * ");
        }
        return;
    }
    // TextNodes carry all user-readable text in the DOM.
    append(((TextNode) node).text());
}
Example 90
| Project: apiman-master File: TemplateScanner.java View source code |
/**
 * Reports whether an element has no child elements. Child nodes that are not
 * {@link Element} instances do not count.
 *
 * @param element the element whose direct child nodes are inspected
 * @return true if the element doesn't have any child elements
 */
private static boolean hasNoChildren(Element element) {
    boolean elementChildSeen = false;
    for (Node child : element.childNodes()) {
        if (child instanceof Element) {
            elementChildSeen = true;
            break;
        }
    }
    return !elementChildSeen;
}
Example 91
| Project: step-master File: RipHomePage.java View source code |
/**
 * Walks the subtree below {@code node} and removes every child whose node name is
 * {@code "#comment"}, recursing into all other children.
 *
 * @param node the root of the subtree to clean
 */
private void removeComments(Node node) {
    // no increment in the loop header: the position only advances past children that are kept
    for (int position = 0; position < node.childNodes().size(); ) {
        final Node candidate = node.childNode(position);
        final boolean isComment = candidate.nodeName().equals("#comment");
        if (isComment) {
            candidate.remove();
        } else {
            removeComments(candidate);
            position++;
        }
    }
}