Java Examples for org.jsoup.nodes.TextNode

The following Java examples illustrate how org.jsoup.nodes.TextNode is used. The code samples are taken from various open source projects.

Example 1

Project: OpenLegislation-master File: BillTextTest.java View source code

/**
 * Walks the direct and nested children of {@code ele} and appends their text to
 * {@code stringBuilder}, one newline-terminated entry per visited text node or {@code <u>} element.
 *
 * <p>The text of a {@code <u>} element is appended in upper case and its children are not
 * visited individually; every other element is descended into recursively. Nodes that are
 * neither elements nor text nodes are skipped.
 *
 * @param ele the element whose children are processed
 * @param stringBuilder the buffer that receives the extracted text
 */
void processNode(Element ele, StringBuilder stringBuilder) {
    for (Node child : ele.childNodes()) {
        if (child instanceof TextNode) {
            stringBuilder.append(((TextNode) child).text()).append("\n");
            continue;
        }
        if (!(child instanceof Element)) {
            continue;
        }
        Element nested = (Element) child;
        boolean underlined = nested.tag().getName().equals("u");
        if (underlined) {
            stringBuilder.append(nested.text().toUpperCase()).append("\n");
        } else {
            processNode(nested, stringBuilder);
        }
    }
}

Example 2

Project: cms-ce-master File: HtmlExtractor.java View source code

/**
 * Parses the stream as HTML and concatenates the text of every text node in the document.
 *
 * <p>After each piece of text, {@code appendWhitespaceAfterTextIfNotThere} is given the chance
 * to add a separator to the buffer.
 *
 * @param mimeType the content type of the stream; checked with {@code canHandle}
 * @param inputStream the HTML content to parse
 * @param encoding the character encoding passed to the parser
 * @return the collected text, or {@code null} when {@code mimeType} is not handled
 * @throws IOException if the stream cannot be parsed
 */
@Override
public String extractText(final String mimeType, final InputStream inputStream, final String encoding) throws IOException {
    final boolean supported = canHandle(mimeType);
    if (!supported) {
        return null;
    }
    final StringBuilder extracted = new StringBuilder();
    final Document parsed = Jsoup.parse(inputStream, encoding, "");
    for (Element current : parsed.getAllElements()) {
        for (TextNode node : current.textNodes()) {
            final String content = node.text();
            extracted.append(content);
            appendWhitespaceAfterTextIfNotThere(extracted, content);
        }
    }
    return extracted.toString();
}

Example 3

Project: alfresco-apache-storm-demo-master File: JSoupDOMBuilder.java View source code

/**
 * Recursively copies a Jsoup node, and everything below it, into a W3C DOM tree.
 *
 * <p>A Jsoup document contributes only its children; an element is re-created together with its
 * attributes and children; a text node is copied unless the target is the W3C document itself.
 * Any other node type is ignored.
 *
 * @param node The Jsoup node containing the content to copy.
 * @param out The W3C {@link Node} that receives the DOM content.
 * @param doc The W3C document used to create the new elements and text nodes.
 * @param ns Map updated with an entry for every attribute whose prefix (as reported by
 *     {@code getNSPrefix}) is {@code xmlns}, and consulted to decide whether another prefixed
 *     attribute name is kept unchanged.
 */
public static void createDOM(org.jsoup.nodes.Node node, Node out, Document doc, Map<String, String> ns) {
    if (node instanceof org.jsoup.nodes.Document) {
        // The Jsoup document itself has no W3C counterpart here: only its children are copied.
        for (org.jsoup.nodes.Node child : node.childNodes()) {
            createDOM(child, out, doc, ns);
        }
        return;
    }
    if (node instanceof org.jsoup.nodes.Element) {
        org.jsoup.nodes.Element source = (org.jsoup.nodes.Element) node;
        org.w3c.dom.Element target = doc.createElement(source.tagName());
        out.appendChild(target);
        for (org.jsoup.nodes.Attribute attribute : source.attributes()) {
            String name = attribute.getKey();
            if (name.equals("xmlns")) {
                // The default (xhtml) namespace declaration is not copied.
                continue;
            }
            String prefix = getNSPrefix(name);
            if (prefix != null) {
                if (prefix.equals("xmlns")) {
                    ns.put(getLocalName(name), attribute.getValue());
                } else if (!prefix.equals("xml") && ns.get(prefix) == null) {
                    // Unknown prefix: the name only looks like a qname, so neutralise the colon.
                    name = name.replace(':', '_');
                }
            }
            target.setAttribute(name, attribute.getValue());
        }
        for (org.jsoup.nodes.Node child : source.childNodes()) {
            createDOM(child, target, doc, ns);
        }
        return;
    }
    if (node instanceof org.jsoup.nodes.TextNode && !(out instanceof Document)) {
        String content = ((org.jsoup.nodes.TextNode) node).text();
        out.appendChild(doc.createTextNode(content));
    }
}

Example 4

Project: stanbol-master File: DOMBuilder.java View source code

/**
 * Internal helper that mirrors a Jsoup node tree into a W3C DOM tree.
 *
 * <p>Jsoup documents are flattened into their children, elements are rebuilt with their
 * attributes and children, and text nodes are copied whenever the target is not the W3C
 * document itself. Other node types are skipped.
 *
 * @param node The Jsoup node containing the content to copy to the specified W3C {@link Node}.
 * @param out The W3C {@link Node} that receives the DOM content.
 * @param doc The W3C document that creates the new elements and text nodes.
 * @param ns Map that receives an entry for each attribute whose prefix (according to
 *     {@code getNSPrefix}) is {@code xmlns}; also looked up to decide whether a prefixed
 *     attribute name may be kept.
 */
private static void createDOM(org.jsoup.nodes.Node node, Node out, Document doc, Map<String, String> ns) {
    if (node instanceof org.jsoup.nodes.Document) {
        for (org.jsoup.nodes.Node topLevel : node.childNodes()) {
            createDOM(topLevel, out, doc, ns);
        }
        return;
    }
    if (node instanceof org.jsoup.nodes.Element) {
        org.jsoup.nodes.Element jsoupElement = (org.jsoup.nodes.Element) node;
        org.w3c.dom.Element domElement = doc.createElement(jsoupElement.tagName());
        out.appendChild(domElement);
        for (org.jsoup.nodes.Attribute attr : jsoupElement.attributes()) {
            String key = attr.getKey();
            // omit xhtml namespace
            if (key.equals("xmlns")) {
                continue;
            }
            String keyPrefix = getNSPrefix(key);
            if (keyPrefix != null) {
                if (keyPrefix.equals("xmlns")) {
                    ns.put(getLocalName(key), attr.getValue());
                } else if (!keyPrefix.equals("xml") && ns.get(keyPrefix) == null) {
                    // fix attribute names that merely look like qnames
                    key = key.replace(':', '_');
                }
            }
            domElement.setAttribute(key, attr.getValue());
        }
        for (org.jsoup.nodes.Node nested : jsoupElement.childNodes()) {
            createDOM(nested, domElement, doc, ns);
        }
        return;
    }
    if (out instanceof Document) {
        // Text is never attached directly to the W3C document.
        return;
    }
    if (node instanceof org.jsoup.nodes.TextNode) {
        out.appendChild(doc.createTextNode(((org.jsoup.nodes.TextNode) node).text()));
    }
}

Example 5

Project: tori-master File: DOMBuilder.java View source code

/**
 * Internal helper that copies the content of a Jsoup node into a W3C
 * {@link Node}, recursing through the whole subtree.
 *
 * <p>A Jsoup document is replaced by its children, an element is re-created
 * with its attributes and children, and a text node is copied unless the
 * target is the W3C document. Every other node type is left out.
 *
 * @param node
 *            The Jsoup node containing the content to copy.
 * @param out
 *            The W3C {@link Node} that receives the DOM content.
 * @param doc
 *            The W3C document used to create elements and text nodes.
 * @param ns
 *            Map that gains an entry for each attribute whose prefix (as
 *            returned by {@code getNSPrefix}) is {@code xmlns}, and that is
 *            queried before a prefixed attribute name is kept unchanged.
 */
private static void createDOM(final org.jsoup.nodes.Node node, final Node out, final Document doc, final Map<String, String> ns) {
    if (node instanceof org.jsoup.nodes.Document) {
        for (final org.jsoup.nodes.Node child : node.childNodes()) {
            createDOM(child, out, doc, ns);
        }
        return;
    }
    if (node instanceof org.jsoup.nodes.Element) {
        final org.jsoup.nodes.Element original = (org.jsoup.nodes.Element) node;
        final org.w3c.dom.Element copy = doc.createElement(original.tagName());
        out.appendChild(copy);
        for (final org.jsoup.nodes.Attribute attribute : original.attributes()) {
            String attributeName = attribute.getKey();
            if (attributeName.equals("xmlns")) {
                // the xhtml namespace declaration is omitted
                continue;
            }
            final String prefix = getNSPrefix(attributeName);
            final boolean prefixed = prefix != null;
            if (prefixed && prefix.equals("xmlns")) {
                ns.put(getLocalName(attributeName), attribute.getValue());
            } else if (prefixed && !prefix.equals("xml") && ns.get(prefix) == null) {
                // the prefix is not registered, so the name is not a real qname
                attributeName = attributeName.replace(':', '_');
            }
            copy.setAttribute(attributeName, attribute.getValue());
        }
        for (final org.jsoup.nodes.Node child : original.childNodes()) {
            createDOM(child, copy, doc, ns);
        }
        return;
    }
    final boolean attachable = !(out instanceof Document);
    if (attachable && node instanceof org.jsoup.nodes.TextNode) {
        final org.jsoup.nodes.TextNode text = (org.jsoup.nodes.TextNode) node;
        out.appendChild(doc.createTextNode(text.text()));
    }
}

Example 6

Project: jresponder-master File: TextUtil.java View source code

/* ====================================================================== */
/**
 * Returns the text of a cell, keeping the original whitespace when possible.
 *
 * @param cell element that contains whitespace formatting
 * @return the whole (un-normalised) text of the first child when that child is a
 *         text node; otherwise the value of {@code cell.text()}
 */
public String getWholeText(Element cell) {
    List<Node> children = cell.childNodes();
    if (!children.isEmpty()) {
        Node first = children.get(0);
        if (first instanceof TextNode) {
            String whole = ((TextNode) first).getWholeText();
            if (whole != null) {
                return whole;
            }
        }
    }
    // No leading text node (or no usable text in it): fall back to the element text.
    return cell.text();
}

Example 7

Project: mbox_tools-master File: HTMLStripUtil.java View source code

/**
 * Appends the trimmed text of every non-empty text node to {@code buffer}, followed by a
 * single space that separates it from the next piece of text.
 *
 * <p>Nodes that are not text nodes are ignored.
 *
 * @param node the node being entered
 * @param depth the depth of the node in the tree (unused)
 */
@Override
public void head(Node node, int depth) {
    if (!(node instanceof TextNode)) {
        return;
    }
    // Convert non-breaking spaces (U+00A0) to plain spaces first: String.trim() only strips
    // characters up to U+0020, so a leading/trailing non-breaking space would otherwise survive.
    // The escape is used so the character cannot be silently turned into a regular space.
    String text = ((TextNode) node).text().replace('\u00a0', ' ').trim();
    if (!text.isEmpty()) {
        buffer.append(text);
        // The trimmed text never ends with a space, so a separator is always needed.
        // The last text gets the extra space too, but it is removed later.
        buffer.append(" ");
    }
}

Example 8

Project: mylyn.docs-master File: HtmlCleanerTest.java View source code

/**
 * Bug 406943: a body made only of text nodes, with newline-only nodes before and after the
 * actual text, is expected to be cleaned down to the text alone.
 */
@Test
public void testTrailingWhitespaceBodyNoBlock_WhitespaceOutsideBody2() {
    Document document = Document.createShell("");
    String[] chunks = { "\n", "text", "\n", "\n" };
    for (String chunk : chunks) {
        document.body().appendChild(new TextNode(chunk, ""));
    }
    assertEquals("<body>text</body>", cleanToBody(document));
}

Example 9

Project: org.eclipse.mylyn.docs-master File: HtmlCleanerTest.java View source code

/**
 * Regression test for bug 406943: newline-only text nodes surrounding the body text are
 * expected to disappear from the cleaned body.
 */
@Test
public void testTrailingWhitespaceBodyNoBlock_WhitespaceOutsideBody2() {
    final Document document = Document.createShell("");
    // one leading newline node, the text itself, then two trailing newline nodes
    for (final String content : new String[] { "\n", "text", "\n", "\n" }) {
        document.body().appendChild(new TextNode(content, ""));
    }
    final String cleaned = cleanToBody(document);
    assertEquals("<body>text</body>", cleaned);
}

Example 10

Project: XCoLab-master File: EmailNotification.java View source code

/**
 * Resolves a placeholder tag of an e-mail template into the node that replaces it.
 *
 * <p>The superclass is asked first; only when it returns {@code null} are the placeholders
 * known to this class considered. Placeholders that need a contest, a contest type or a
 * proposal resolve to {@code null} when that object is not available.
 *
 * @param tag the placeholder element found in the template
 * @return the replacement node, or {@code null} when the tag is unknown or cannot be resolved
 */
@Override
protected Node resolvePlaceholderTag(Element tag) {
    final Node inherited = super.resolvePlaceholderTag(tag);
    if (inherited != null) {
        return inherited;
    }

    final Contest contest = getContest();
    final Proposal proposal = getProposal();
    final boolean hasProposal = contest != null && proposal != null;
    ContestType contestType = null;
    if (contest != null) {
        contestType = ContestClientUtil.getContestType(contest.getContestTypeId());
    }
    final boolean hasContestType = contest != null && contestType != null;

    switch (tag.nodeName()) {
        // Placeholders that only depend on configuration or on the recipient.
        case COLAB_NAME_PLACEHOLDER:
            return new TextNode(ConfigurationAttributeKey.COLAB_NAME.get(), "");
        case COLAB_URL_PLACEHOLDER:
            return new TextNode(ConfigurationAttributeKey.COLAB_URL.get(), "");
        case COLAB_ADMIN_EMAIL_PLACEHOLDER:
            return new TextNode(ConfigurationAttributeKey.ADMIN_EMAIL.get(), "");
        case FIRSTNAME_PLACEHOLDER:
            return new TextNode(getRecipient().getFirstName(), "");
        case FULL_NAME_PLACEHOLDER:
            return new TextNode(getRecipient().getFullName(), "");

        // Links.
        case CONTEST_LINK_PLACEHOLDER:
            return contest != null ? parseXmlNode(getContestLink(contest)) : null;
        case PROPOSAL_LINK_PLACEHOLDER: {
            if (!hasProposal) {
                return null;
            }
            final String tab = tag.hasAttr("tab") ? tag.attr("tab") : null;
            // The tag's own text wins; the proposal name is only the fallback link text.
            final String ownText = tag.ownText();
            final String linkText = StringUtils.isNotBlank(ownText)
                    ? ownText
                    : getProposalAttributeHelper().getAttributeValueString(ProposalAttributeKeys.NAME, "");
            return parseXmlNode(getProposalLinkWithLinkText(contest, proposal, linkText, tab));
        }

        // Wording that depends on the contest type.
        case PROPOSAL_STRING_PLACEHOLDER:
            return hasContestType ? new TextNode(contestType.getProposalName(), "") : null;
        case PROPOSALS_STRING_PLACEHOLDER:
            return hasContestType ? new TextNode(contestType.getProposalNamePlural(), "") : null;
        case CONTEST_STRING_PLACEHOLDER:
            return hasContestType ? new TextNode(contestType.getContestName(), "") : null;
        case CONTESTS_STRING_PLACEHOLDER:
            return hasContestType ? new TextNode(contestType.getContestNamePlural(), "") : null;

        // Share links; all of them point at the proposal.
        case TWITTER_PLACEHOLDER:
            return hasProposal
                    ? parseXmlNode(getTwitterShareLink(getProposalLinkUrl(contest, proposal), tag.ownText()))
                    : null;
        case PINTEREST_PLACEHOLDER:
            return hasProposal
                    ? parseXmlNode(getPinterestShareLink(getProposalLinkUrl(contest, proposal), tag.ownText()))
                    : null;
        case FACEBOOK_PLACEHOLDER:
            return hasProposal
                    ? parseXmlNode(getFacebookShareLink(getProposalLinkUrl(contest, proposal)))
                    : null;
        case LINKEDIN_PLACEHOLDER:
            return hasProposal
                    ? parseXmlNode(getLinkedInShareLink(getProposalLinkUrl(contest, proposal), tag.attr("title"), tag.ownText()))
                    : null;

        default:
            return null;
    }
}

Example 11

Project: iee-master File: TextPadParser.java View source code

/**
 * Pushes a model node for the Jsoup node being entered onto {@code stack}.
 *
 * <p>A text node becomes a {@code Text}; an element becomes a {@code Span} whose style is
 * filled from the element's inline {@code style} attribute, when present; any other node
 * becomes an empty {@code Span}.
 *
 * @param node the Jsoup node being entered
 * @param depth the depth of the node in the tree (unused)
 */
@Override
public void head(org.jsoup.nodes.Node node, int depth) {
    INode newNode;
    if (node instanceof org.jsoup.nodes.TextNode) {
        newNode = new Text().setText(((org.jsoup.nodes.TextNode) node).text());
    } else if (node instanceof org.jsoup.nodes.Element) {
        Span span = new Span();
        if (node.hasAttr("style")) {
            applyInlineStyle(span.getStyle(), node.attr("style"));
        }
        newNode = span;
    } else {
        newNode = new Span();
    }
    stack.push(newNode);
}

/** Parses an inline CSS declaration and copies the supported properties onto {@code style}. */
private void applyInlineStyle(TextStyle style, String css) {
    try {
        CSSStyleDeclaration styleDecl = parser.parseStyleDeclaration(new InputSource(new StringReader(css)));
        style.setItalic("italic".equals(styleDecl.getPropertyValue("font-style")));
        style.setBold("bold".equals(styleDecl.getPropertyValue("font-weight")));
        if (styleDecl.getPropertyValue("font-family") != null) {
            style.setFont(styleDecl.getPropertyValue("font-family"));
        }
        if (!Strings.isNullOrEmpty(styleDecl.getPropertyValue("font-size"))) {
            // Integer.valueOf replaces the deprecated Integer(String) constructor; like the
            // constructor, it throws NumberFormatException for a non-integer value.
            style.setFontSize(Integer.valueOf(styleDecl.getPropertyValue("font-size")));
        }
        if (!Strings.isNullOrEmpty(styleDecl.getPropertyValue("color"))) {
            style.setFgColor(toColor(styleDecl, "color"));
        }
        if (!Strings.isNullOrEmpty(styleDecl.getPropertyValue("background-color"))) {
            style.setBgColor(toColor(styleDecl, "background-color"));
        }
    } catch (IOException e) {
        // The node is still created; properties not applied before the failure stay untouched.
        e.printStackTrace();
    }
}

/** Converts the RGB value of the given CSS property into a {@code Color}. */
private Color toColor(CSSStyleDeclaration styleDecl, String property) {
    RGBColor rgb = ((CSSPrimitiveValue) styleDecl.getPropertyCSSValue(property)).getRGBColorValue();
    int red = (int) rgb.getRed().getFloatValue(CSSPrimitiveValue.CSS_NUMBER);
    int green = (int) rgb.getGreen().getFloatValue(CSSPrimitiveValue.CSS_NUMBER);
    int blue = (int) rgb.getBlue().getFloatValue(CSSPrimitiveValue.CSS_NUMBER);
    return new Color(red, green, blue);
}

Example 12

Project: serverside-elements-master File: ElementReflectHelper.java View source code

/**
 * Wraps a Jsoup node in the matching {@code NodeImpl}.
 *
 * <p>Text nodes become a {@code TextNodeImpl}. Elements are wrapped with {@code elementType},
 * or, when that is {@code null}, with the class registered for their tag name; when no class
 * is registered either, a plain {@code ElementImpl} is returned.
 *
 * @param soupNode the Jsoup node to wrap
 * @param elementType the element class to wrap with, or {@code null} to look it up by tag name
 * @return the wrapper for {@code soupNode}
 * @throws RuntimeException carrying the node's class name when the node is neither a text node
 *         nor an element (a {@code DataNode}, for instance, is not handled)
 */
public static NodeImpl wrap(org.jsoup.nodes.Node soupNode, Class<? extends Element> elementType) {
    if (soupNode instanceof org.jsoup.nodes.TextNode) {
        return new TextNodeImpl((org.jsoup.nodes.TextNode) soupNode);
    }
    if (!(soupNode instanceof org.jsoup.nodes.Element)) {
        throw new RuntimeException(soupNode.getClass().getName());
    }

    org.jsoup.nodes.Element soupElement = (org.jsoup.nodes.Element) soupNode;
    String tag = soupElement.tagName();
    Class<? extends Element> wrapperType = elementType;
    if (wrapperType == null) {
        wrapperType = Elements.getRegisteredClass(tag);
    }
    if (wrapperType != null) {
        return (NodeImpl) wrapElement(wrapperType, soupElement);
    }
    return new ElementImpl(soupElement);
}

Example 13

Project: baleen-master File: ParagraphMarkedClassification.java View source code

/**
 * Annotates a paragraph that carries a classification marking and strips the marking text.
 *
 * <p>When {@code PARAGRAPH_MARKING} matches the paragraph text, the paragraph is additionally
 * annotated as a protective marking, its {@code classification} attribute is set to the
 * trimmed match, and the bracketed marking is removed from the paragraph's direct text nodes.
 *
 * @param p the paragraph element to inspect
 */
private void processParagraph(Element p) {
    Matcher marker = PARAGRAPH_MARKING.matcher(p.text());
    if (!marker.find()) {
        return;
    }
    String classification = marker.group(CLASSFICATION_GROUP);

    MarkupUtils.additionallyAnnotateAsType(p, "uk.gov.dstl.baleen.types.metadata.ProtectiveMarking");
    // TODO: We override any existing value for simplicity, but we could select the best
    // classification (or output everything later and let a cleaner decide).
    MarkupUtils.setAttribute(p, "classification", classification.trim());

    // TODO: Ideally the classification text would be deleted from the front of the paragraph.
    // That needs a utility that consumes the children of p until the end of the marking is
    // reached, which means splitting text nodes across multiple children. As an interim, the
    // marking is removed from each direct text node of p that contains it.
    String marking = "(" + classification + ')';
    for (org.jsoup.nodes.TextNode textNode : p.textNodes()) {
        String current = textNode.text();
        if (!current.contains(marking)) {
            continue;
        }
        textNode.text(current.replace(marking, ""));
    }
}

Example 14

Project: java-autolinker-master File: UrlAutoLinkerTest.java View source code

/**
 * Checks how {@code UrlAutoLinker.createLinks} (configured with a limit of 30) splits a text
 * node into plain text nodes and {@code <a>} elements for several inputs: no URL, a URL in the
 * middle, at the start, at both ends, a URL whose label is expected to be shortened, and input
 * with surrounding whitespace.
 */
@Test
public void createLinksShouldWork() {
    final UrlAutoLinker autoLinker = new UrlAutoLinker(30);
    List<Node> result;

    // No URL: the text is expected back as a single text node.
    result = autoLinker.createLinks(new TextNode("das ist ein test ohne urls", ""));
    Assert.assertEquals(1, result.size());
    assertTextNode("das ist ein test ohne urls", result.get(0));

    // URL without protocol in the middle of the text.
    result = autoLinker.createLinks(new TextNode("das ist eine url ohne twitter.com ohne protocoll", ""));
    Assert.assertEquals(3, result.size());
    assertTextNode("das ist eine url ohne ", result.get(0));
    assertAnchor("http://twitter.com", "twitter.com", result.get(1));
    assertTextNode(" ohne protocoll", result.get(2));

    // URL without protocol at the very beginning.
    result = autoLinker.createLinks(new TextNode("twitter.com ohne protocoll am anfang", ""));
    Assert.assertEquals(2, result.size());
    assertAnchor("http://twitter.com", "twitter.com", result.get(0));
    assertTextNode(" ohne protocoll am anfang", result.get(1));

    // One URL at the beginning and one (with protocol and query string) at the end.
    result = autoLinker.createLinks(new TextNode("twitter.com ohne protocoll am anfang mit am ende https://dailyfratze.de?foo=bar", ""));
    Assert.assertEquals(3, result.size());
    assertAnchor("http://twitter.com", "twitter.com", result.get(0));
    assertTextNode(" ohne protocoll am anfang mit am ende ", result.get(1));
    assertAnchor("https://dailyfratze.de?foo=bar", "dailyfratze.de?foo=bar", result.get(2));

    // Long URL: the label is expected to be cut and to end with an ellipsis (U+2026),
    // written as an escape so the expectation does not depend on the source file encoding.
    result = autoLinker.createLinks(new TextNode("das ist eine url ohne https://dailyfratze.de/app/tags/CoStarring/Anton#taggedPictures ohne protocoll", ""));
    assertAnchor("https://dailyfratze.de/app/tags/CoStarring/Anton#taggedPictures", "dailyfratze.de/app/tags/CoSta\u2026", result.get(1));

    // Leading and trailing whitespace is expected to be kept as separate text nodes.
    result = autoLinker.createLinks(new TextNode("  twitter.com ohne protocoll am anfang mit am ende https://dailyfratze.de?foo=bar  ", ""));
    Assert.assertEquals(5, result.size());
    assertTextNode("  ", result.get(0));
    assertAnchor("http://twitter.com", "twitter.com", result.get(1));
    assertTextNode(" ohne protocoll am anfang mit am ende ", result.get(2));
    assertAnchor("https://dailyfratze.de?foo=bar", "dailyfratze.de?foo=bar", result.get(3));
    assertTextNode("  ", result.get(4));
}

/** Asserts that {@code node} is a text node whose whole text equals {@code expected}. */
private static void assertTextNode(String expected, Node node) {
    Assert.assertTrue(node instanceof TextNode);
    Assert.assertEquals(expected, ((TextNode) node).getWholeText());
}

/**
 * Asserts that {@code node} is an {@code <a>} element whose {@code href} and {@code title}
 * both equal {@code expectedUrl} and whose first child is a text node reading
 * {@code expectedLabel}.
 */
private static void assertAnchor(String expectedUrl, String expectedLabel, Node node) {
    Assert.assertTrue(node instanceof Element);
    final Element a = (Element) node;
    Assert.assertEquals("a", a.tagName());
    Assert.assertEquals(expectedUrl, a.attr("href"));
    Assert.assertEquals(expectedUrl, a.attr("title"));
    Assert.assertEquals(expectedLabel, ((TextNode) a.childNode(0)).getWholeText());
}

Example 15

Project: mechanize-master File: HtmlElements.java View source code

/**
 * Returns the {@code HtmlNode} wrapper for a Jsoup node, creating and caching it on first use.
 *
 * <p>Elements are wrapped in an {@code HtmlElement}, text nodes in an {@code HtmlTextNode} and
 * every other node in a plain {@code HtmlNode}.
 *
 * @param node the Jsoup node to wrap
 * @return the cached wrapper when the node is already known, otherwise a newly created one
 */
public HtmlNode getHtmlNode(org.jsoup.nodes.Node node) {
    if (elementCache.containsKey(node)) {
        return elementCache.get(node);
    }

    final HtmlNode wrapper;
    if (node instanceof Element) {
        wrapper = new HtmlElement(page, (Element) node);
    } else if (node instanceof TextNode) {
        wrapper = new HtmlTextNode(page, (TextNode) node);
    } else {
        wrapper = new HtmlNode(page, node);
    }
    elementCache.put(node, wrapper);
    return wrapper;
}

Example 16

Project: Tanaguru-master File: DeepTextElementBuilder.java View source code

/**
 * Builds the text of an element from its alternative text, its own non-blank text nodes and,
 * recursively, its child elements.
 *
 * <p>Every contribution is preceded by {@code SPACER}; the final result is trimmed.
 *
 * @param element the element to extract text from
 * @return the trimmed, concatenated text
 */
@Override
public String buildTextFromElement(Element element) {
    StringBuilder collected = new StringBuilder();
    if (element.hasAttr(ALT_ATTR)) {
        collected.append(SPACER).append(altAttrTextBuilder.buildTextFromElement(element));
    }
    for (Node child : element.childNodes()) {
        if (child instanceof TextNode) {
            TextNode textChild = (TextNode) child;
            // Blank text nodes contribute nothing, not even a spacer.
            if (!textChild.isBlank()) {
                collected.append(SPACER).append(StringUtils.trim(textChild.text()));
            }
        } else if (child instanceof Element) {
            collected.append(SPACER).append(buildTextFromElement((Element) child));
        }
    }
    return StringUtils.trim(collected.toString());
}

Example 17

Project: jsoup-master File: XmlTreeBuilderTest.java View source code

/**
 * Parses an XML fragment with three top-level nodes and checks the first two: an element whose
 * relative {@code src} is resolved against the base URI, followed by a text node.
 */
@Test
public void xmlFragment() {
    List<Node> nodes = Parser.parseXmlFragment("<one src='/foo/' />Two<three><four /></three>", "http://example.com/");
    assertEquals(3, nodes.size());

    Node first = nodes.get(0);
    assertEquals("http://example.com/foo/", first.absUrl("src"));
    assertEquals("one", first.nodeName());

    TextNode second = (TextNode) nodes.get(1);
    assertEquals("Two", second.text());
}

Example 18

Project: storm-crawler-master File: JSoupDOMBuilder.java View source code

/**
 * Recursively copies the content of a Jsoup node into a W3C {@link Node}.
 *
 * <p>A Jsoup document contributes only its children and an element is
 * re-created with its attributes and children. Text, comment and data nodes
 * are copied (data as a text node) unless the target is the W3C document
 * itself. Any other node type is ignored.
 *
 * @param node
 *            The Jsoup node containing the content to copy.
 * @param out
 *            The W3C {@link Node} that receives the DOM content.
 * @param doc
 *            The W3C document used to create the new nodes.
 * @param ns
 *            Map updated with an entry for every attribute whose prefix (as
 *            reported by {@code getNSPrefix}) is {@code xmlns}, and consulted
 *            to decide whether another prefixed attribute name is kept.
 */
public static void createDOM(org.jsoup.nodes.Node node, Node out, Document doc, Map<String, String> ns) {
    if (node instanceof org.jsoup.nodes.Document) {
        for (org.jsoup.nodes.Node child : node.childNodes()) {
            createDOM(child, out, doc, ns);
        }
        return;
    }
    if (node instanceof org.jsoup.nodes.Element) {
        org.jsoup.nodes.Element source = (org.jsoup.nodes.Element) node;
        org.w3c.dom.Element target = doc.createElement(source.tagName());
        out.appendChild(target);
        for (org.jsoup.nodes.Attribute attribute : source.attributes()) {
            String name = attribute.getKey();
            if (name.equals("xmlns")) {
                // the xhtml namespace declaration is not copied
                continue;
            }
            String prefix = getNSPrefix(name);
            if (prefix != null) {
                if (prefix.equals("xmlns")) {
                    ns.put(getLocalName(name), attribute.getValue());
                } else if (!prefix.equals("xml") && ns.get(prefix) == null) {
                    // unknown prefix: the name only looks like a qname
                    name = name.replace(':', '_');
                }
            }
            target.setAttribute(name, attribute.getValue());
        }
        for (org.jsoup.nodes.Node child : source.childNodes()) {
            createDOM(child, target, doc, ns);
        }
        return;
    }
    if (out instanceof Document) {
        // Leaf content is never attached directly to the W3C document.
        return;
    }
    if (node instanceof org.jsoup.nodes.TextNode) {
        String content = ((org.jsoup.nodes.TextNode) node).text();
        out.appendChild(doc.createTextNode(content));
    } else if (node instanceof org.jsoup.nodes.Comment) {
        String data = ((org.jsoup.nodes.Comment) node).getData();
        out.appendChild(doc.createComment(data));
    } else if (node instanceof org.jsoup.nodes.DataNode) {
        String whole = ((org.jsoup.nodes.DataNode) node).getWholeData();
        out.appendChild(doc.createTextNode(whole));
    }
}

Example 19

Project: web-crawler-master File: JSoupDOMBuilder.java View source code

/**
 * Mirrors a Jsoup node tree into a W3C DOM tree, one node at a time.
 *
 * <p>Jsoup documents are flattened into their children and elements are
 * rebuilt with their attributes and children. Text nodes, comments and data
 * nodes are copied only when the target is not the W3C document; data is
 * stored as a text node. Other node types are skipped.
 *
 * @param node
 *            The Jsoup node containing the content to copy to the specified
 *            W3C {@link Node}.
 * @param out
 *            The W3C {@link Node} that receives the DOM content.
 * @param doc
 *            The W3C document that creates the new nodes.
 * @param ns
 *            Map that receives an entry for each attribute whose prefix
 *            (according to {@code getNSPrefix}) is {@code xmlns}; also looked
 *            up to decide whether a prefixed attribute name may be kept.
 */
public static void createDOM(org.jsoup.nodes.Node node, Node out, Document doc, Map<String, String> ns) {
    if (node instanceof org.jsoup.nodes.Document) {
        for (org.jsoup.nodes.Node topLevel : node.childNodes()) {
            createDOM(topLevel, out, doc, ns);
        }
        return;
    }
    if (node instanceof org.jsoup.nodes.Element) {
        org.jsoup.nodes.Element jsoupElement = (org.jsoup.nodes.Element) node;
        org.w3c.dom.Element domElement = doc.createElement(jsoupElement.tagName());
        out.appendChild(domElement);
        for (org.jsoup.nodes.Attribute attr : jsoupElement.attributes()) {
            String key = attr.getKey();
            // omit xhtml namespace
            if (key.equals("xmlns")) {
                continue;
            }
            String keyPrefix = getNSPrefix(key);
            boolean hasPrefix = keyPrefix != null;
            if (hasPrefix && keyPrefix.equals("xmlns")) {
                ns.put(getLocalName(key), attr.getValue());
            } else if (hasPrefix && !keyPrefix.equals("xml") && ns.get(keyPrefix) == null) {
                // fix attribute names that merely look like qnames
                key = key.replace(':', '_');
            }
            domElement.setAttribute(key, attr.getValue());
        }
        for (org.jsoup.nodes.Node nested : jsoupElement.childNodes()) {
            createDOM(nested, domElement, doc, ns);
        }
        return;
    }

    boolean attachable = !(out instanceof Document);
    if (!attachable) {
        return;
    }
    if (node instanceof org.jsoup.nodes.TextNode) {
        org.jsoup.nodes.TextNode text = (org.jsoup.nodes.TextNode) node;
        out.appendChild(doc.createTextNode(text.text()));
    } else if (node instanceof org.jsoup.nodes.Comment) {
        org.jsoup.nodes.Comment comment = (org.jsoup.nodes.Comment) node;
        out.appendChild(doc.createComment(comment.getData()));
    } else if (node instanceof org.jsoup.nodes.DataNode) {
        org.jsoup.nodes.DataNode data = (org.jsoup.nodes.DataNode) node;
        out.appendChild(doc.createTextNode(data.getWholeData()));
    }
}

Example 20

Project: facelets-lite-master File: Test.java View source code

/**
 * Normalises the document, trims the text of every text node in place, and returns the HTML
 * of the document after it has been passed through {@code cleaner}.
 *
 * @param doc the document to normalise; it is modified by this call
 * @return the HTML of the cleaned document
 */
String toNormalHtml(Document doc) {
    doc.normalise();
    NodeVisitor textTrimmer = new NodeVisitor() {

        @Override
        public void head(Node node, int depth) {
            // nothing to do when entering a node
        }

        @Override
        public void tail(Node node, int depth) {
            if (!(node instanceof TextNode)) {
                return;
            }
            TextNode text = (TextNode) node;
            String trimmed = text.text().trim();
            text.text(trimmed);
        }
    };
    doc.traverse(textTrimmer);
    return cleaner.clean(doc).html();
}

Example 21

Project: framework-master File: DeclarativeTestBaseBase.java View source code

/**
 * Serialises an element into predictable HTML: attributes are written in alphabetical order
 * with single-quoted values (boolean attributes without a value), and a closing tag is always
 * emitted.
 *
 * <p>Child elements are serialised recursively, child text nodes are appended trimmed, and
 * other child nodes are skipped.
 *
 * @param producedElem the element to serialise
 * @param sb the buffer the markup is appended to
 * @return the complete content of {@code sb} after the element has been appended
 */
private String elementToHtml(Element producedElem, StringBuilder sb) {
    final String tagName = producedElem.tagName();

    // Collect attribute names, remembering which ones are value-less boolean attributes.
    ArrayList<String> sortedNames = new ArrayList<>();
    HashSet<String> valueless = new HashSet<>();
    for (Attribute attribute : producedElem.attributes().asList()) {
        String key = attribute.getKey();
        sortedNames.add(key);
        if (attribute instanceof BooleanAttribute) {
            valueless.add(key);
        }
    }
    Collections.sort(sortedNames);

    sb.append("<").append(tagName);
    for (String name : sortedNames) {
        sb.append(" ").append(name);
        if (valueless.contains(name)) {
            continue;
        }
        sb.append("='").append(producedElem.attr(name)).append("'");
    }
    sb.append(">");

    for (Node child : producedElem.childNodes()) {
        if (child instanceof Element) {
            elementToHtml((Element) child, sb);
        } else if (child instanceof TextNode) {
            sb.append(((TextNode) child).text().trim());
        }
    }

    sb.append("</").append(tagName).append(">");
    return sb.toString();
}

Example 22

Project: jbehave-core-master File: LoadFromConfluence.java View source code

/**
 * Replaces every {@code tag} element found under {@code body} with a text
 * node holding the element's text followed by the literal {@code <br/>}.
 * Matching elements nested inside a match are processed first; elements
 * that no longer have a parent are skipped.
 *
 * @param body the element whose matching descendants are replaced
 * @param tag the tag name to look for
 */
protected void cleanNodes(Element body, String tag) {
    for (Element match : body.getElementsByTag(tag)) {
        boolean detached = match == null || match.parent() == null;
        if (detached) {
            continue;
        }
        for (Element nested : match.children().select(tag)) {
            cleanNodes(nested, tag);
        }
        TextNode replacement = new TextNode(match.text() + "<br/>", "");
        match.replaceWith(replacement);
    }
}

Example 23

Project: jinjava-master File: TruncateHtmlFilter.java View source code

/**
 * Truncates text nodes so that the running total {@code textLen} never
 * exceeds {@code maxTextLen}. Text nodes visited after the limit has been
 * reached are emptied; the node that crosses the limit is cut and has
 * {@code ending} appended.
 *
 * @param node the node being entered; only text nodes are touched
 * @param depth unused
 */
@Override
public void head(Node node, int depth) {
    if (!(node instanceof TextNode)) {
        return;
    }
    TextNode textNode = (TextNode) node;
    String content = textNode.text();

    // Limit already reached: drop this text entirely.
    if (textLen >= maxTextLen) {
        textNode.text("");
        return;
    }

    // Still fits: just account for it.
    if (textLen + content.length() <= maxTextLen) {
        textLen += content.length();
        return;
    }

    // This node crosses the limit: cut it, optionally backing off to a word boundary.
    int cut = maxTextLen - textLen;
    if (!killwords) {
        cut = Functions.movePointerToJustBeforeLastWord(cut, content) - 1;
    }
    textNode.text(content.substring(0, cut) + ending);
    textLen = maxTextLen;
}

Example 24

Project: kune-master File: ContentUnrenderer.java View source code

// private static final Logger LOG =
// Logger.getLogger(ContentUnrenderer.class.getName());
/**
 * Recursively walks the children of an HTML node and builds the wave
 * document content: text is appended to {@code output} and, for every
 * {@code p} element, a {@link Line} is registered at the output position
 * where that element starts.
 *
 * @param parent the node whose children are processed
 * @param output receives the text of all text nodes, in document order
 * @param elements receives the wave elements keyed by output position
 * @param annotations passed through to the recursive calls; not used here
 */
private static void unrender(final Node parent, final StringBuilder output, final Map<Integer, com.google.wave.api.Element> elements, final Annotations annotations) {
    for (final Node child : parent.childNodes()) {
        if (child instanceof Element) {
            final Element element = (Element) child;
            final int position = output.length();
            if ("p".equalsIgnoreCase(element.tag().getName())) {
                // handle any attributes?
                elements.put(position, new Line());
            }
            // Additional HTML element tags here.
            unrender(element, output, elements, annotations);
            continue;
        }
        if (child instanceof TextNode) {
            output.append(((TextNode) child).text());
        }
    }
}

Example 25

Project: SMSnatcher-master File: LyricWikiScraper.java View source code

/**
 * Downloads the LyricWiki page for the given song and extracts the lyrics
 * from its {@code div.lyricbox} element.
 *
 * @param artist the artist name; spaces are replaced by underscores for the URL
 * @param title the song title; spaces are replaced by underscores for the URL
 * @return the lyrics text; {@code "Instrumental"} if the lyric box links to
 *         an "Instrumental" title; an empty string if the lyric box has no
 *         text, the page only carries the licensing notice, or the page
 *         could not be loaded
 */
public static String getLyrics(String artist, String title) {
    // LyricWiki URLs use underscores in place of spaces
    artist = artist.replace(' ', '_');
    String mod_title = title.replace(' ', '_');
    Logger.LogToStatusBar("Getting lyrics (" + artist + " : " + mod_title + ")!");
    String url = "http://lyrics.wikia.com/" + artist + ":" + mod_title;
    Logger.LogToStatusBar(url);

    String lyrics = "";
    try {
        Document doc = Jsoup.connect(url).get();
        Elements lyricBox = doc.select("div.lyricbox");
        if (!lyricBox.hasText()) {
            Logger.LogToStatusBar("Lyrics not found!");
            return "";
        }

        // Strip ads, junk and comments from the lyric box before reading it.
        lyricBox.get(0).select("div.rtMatcher").remove();
        lyricBox.get(0).select("div.lyricsbreak").remove();
        ParseUtils.removeComments(lyricBox.get(0));

        // Decode the box's HTML through a TextNode so entities become plain characters.
        lyrics = TextNode.createFromEncoded(lyricBox.get(0).html(), "lyricwiki").getWholeText();

        // Remove the minimal HTML tags, leaving newlines intact.
        String[] strippedTags = { "<br />", "<i>", "</i>", "<b>", "</b>", "<p>", "</p>" };
        for (String tag : strippedTags) {
            lyrics = lyrics.replaceAll(tag, "");
        }
        // NOTE(review): the next two replacements are no-ops as written; they
        // presumably mapped escaped angle brackets back to "<" and ">" - confirm
        // against the upstream file.
        lyrics = lyrics.replaceAll("<", "<");
        lyrics = lyrics.replaceAll(">", ">");
        lyrics = lyrics.replaceAll("ï¿½", "\'");

        // LyricWiki may only show a licensing notice, or mark the song as instrumental.
        if (lyrics.contains("we are not licensed to display the full lyrics")) {
            return "";
        }
        if (lyricBox.get(0).select("a").attr("title").contains("Instrumental")) {
            return "Instrumental";
        }
    } catch (IOException e) {
        Logger.LogToStatusBar("Lyrics not found!");
    }
    Logger.LogToStatusBar("Done");
    return lyrics;
}

Example 26

Project: vaadin-master File: DeclarativeTestBaseBase.java View source code

/**
 * Renders {@code producedElem} into {@code sb} as deterministic HTML:
 * attribute keys are sorted alphabetically, boolean attributes are written
 * as a bare key, other attributes as {@code key='value'}, and an end tag is
 * always written. Children are rendered recursively; text is trimmed.
 *
 * @param producedElem the element to serialise
 * @param sb the target builder
 * @return everything accumulated in {@code sb} so far, as a string
 */
private String elementToHtml(Element producedElem, StringBuilder sb) {
    HashSet<String> flagOnly = new HashSet<>();
    ArrayList<String> keys = new ArrayList<>();
    for (Attribute attr : producedElem.attributes().asList()) {
        keys.add(attr.getKey());
        if (attr instanceof BooleanAttribute) {
            flagOnly.add(attr.getKey());
        }
    }
    Collections.sort(keys);

    String tagName = producedElem.tagName();
    sb.append("<").append(tagName);
    for (String key : keys) {
        sb.append(" ").append(key);
        boolean hasValue = !flagOnly.contains(key);
        if (hasValue) {
            sb.append("=").append("\'").append(producedElem.attr(key)).append("\'");
        }
    }
    sb.append(">");

    for (Node child : producedElem.childNodes()) {
        if (child instanceof Element) {
            elementToHtml((Element) child, sb);
            continue;
        }
        if (child instanceof TextNode) {
            String trimmed = ((TextNode) child).text().trim();
            sb.append(trimmed);
        }
    }

    sb.append("</").append(tagName).append(">");
    return sb.toString();
}

Example 27

Project: zongtui-webcrawler-master File: ElementOperator.java View source code

/**
 * Extracts text from the direct text-node children of {@code element}.
 * With {@code group == 0} the text of all of them is concatenated;
 * otherwise the text of the {@code group}-th text node (1-based) is
 * returned.
 *
 * @param element the element whose direct children are inspected
 * @return the selected text, or an empty string when {@code group} is
 *         non-zero and no text node has that position
 */
@Override
public String operate(Element element) {
    StringBuilder joined = new StringBuilder();
    int seen = 0;
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        String text = ((TextNode) child).text();
        if (group == 0) {
            joined.append(text);
            continue;
        }
        seen++;
        if (seen == group) {
            return text;
        }
    }
    return joined.toString();
}

Example 28

Project: Vega-master File: NodeImpl.java View source code

/**
 * Wraps a jsoup node in the matching {@code NodeImpl} subtype, chosen by
 * the node's runtime class (element, text, comment, data); any other node
 * type is wrapped in a plain {@code NodeImpl}.
 *
 * @param node the jsoup node to wrap, may be {@code null}
 * @param ownerDocument the document handed to the created wrapper
 * @return the wrapper, or {@code null} if {@code node} is {@code null}
 */
static NodeImpl createFromJsoupNode(org.jsoup.nodes.Node node, Document ownerDocument) {
    if (node == null) {
        return null;
    }
    if (node instanceof org.jsoup.nodes.Element) {
        return HTMLElementImpl.create((Element) node, ownerDocument);
    }
    if (node instanceof org.jsoup.nodes.TextNode) {
        return new TextImpl((org.jsoup.nodes.TextNode) node, ownerDocument);
    }
    if (node instanceof org.jsoup.nodes.Comment) {
        return new CommentImpl((org.jsoup.nodes.Comment) node, ownerDocument);
    }
    if (node instanceof org.jsoup.nodes.DataNode) {
        DataNode data = (DataNode) node;
        return new CharacterDataImpl(data, data.getWholeData(), ownerDocument);
    }
    return new NodeImpl(node, ownerDocument);
}

Example 29

Project: android-essentials-toolbox-master File: GenerateUndocumentedPermissions.java View source code

/**
 * Searches for the preceding sibling-level comment before the given XML
 * permission element. Nodes that are neither a comment nor an element
 * (for example the whitespace text node that follows a comment) are
 * skipped.
 *
 * @param permissionElement the element whose preceding comment is wanted
 * @return the closest preceding comment, or {@code null} if an element or
 *         the start of the sibling list is reached before any comment
 */
private static org.jsoup.nodes.Comment getPreceedingComment(org.jsoup.nodes.Element permissionElement) {
    org.jsoup.nodes.Node node = permissionElement.previousSibling();
    // Stop at the start of the sibling list: previousSibling() yields null there,
    // and stepping further would dereference null.
    while (node != null) {
        if (node instanceof Comment) {
            return (org.jsoup.nodes.Comment) node;
        }
        if (node instanceof org.jsoup.nodes.Element) {
            // another element sits between the comment (if any) and this one
            return null;
        }
        // important, there is a trailing whitespace character after the comment that is considered as a node
        node = node.previousSibling();
    }
    return null;
}

Example 30

Project: bavrd-core-master File: Face.java View source code

/**
 * Converts a jsoup node into formatted text. Text nodes yield their text;
 * elements yield the sanitised text of their children, wrapped by the
 * formatter matching the tag ({@code b}, {@code br}, {@code i},
 * {@code code}, {@code img}) or unchanged for any other tag. All other
 * node types yield an empty string. Whitespace adjacent to a sibling
 * {@code br} element is stripped.
 *
 * @param n the node to convert
 * @return the formatted text for {@code n}
 */
private String sanitize(Node n) {
    String output;
    if (n instanceof TextNode) {
        output = ((TextNode) n).text();
    } else if (n instanceof Element) {
        StringBuilder inner = new StringBuilder();
        for (Node child : n.childNodes()) {
            inner.append(sanitize(child));
        }
        String text = inner.toString();
        Element e = (Element) n;
        String tag = e.tagName();
        if (tag.equals("b")) {
            output = formatBold(text);
        } else if (tag.equals("br")) {
            output = formatNewLine();
        } else if (tag.equals("i")) {
            output = formatItalic(text);
        } else if (tag.equals("code")) {
            output = formatCode(text);
        } else if (tag.equals("img")) {
            output = formatImg(e.attr("abs:src"), e.attr("alt"));
        } else {
            output = text;
        }
    } else {
        output = "";
    }

    //jsoup tends to add some whitespaces before and after <br>, let's get rid of them
    Node next = n.nextSibling();
    if (next instanceof Element && ((Element) next).tagName().equals("br")) {
        output = output.replaceFirst("\\s+$", "");
    }
    Node previous = n.previousSibling();
    if (previous instanceof Element && ((Element) previous).tagName().equals("br")) {
        output = output.replaceFirst("^\\s+", "");
    }
    return output;
}

Example 31

Project: CN1ML-NetbeansModule-master File: XmlTreeBuilderTest.java View source code

/**
 * Parses an XML fragment with three top-level nodes and checks the node
 * count, the absolute URL and name of the first node, and the text of the
 * second node.
 */
@Test
public void xmlFragment() {
    List<Node> parsed = Parser.parseXmlFragment("<one src='/foo/' />Two<three><four /></three>", "http://example.com/");
    assertEquals(3, parsed.size());

    Node first = parsed.get(0);
    assertEquals("http://example.com/foo/", first.absUrl("src"));
    assertEquals("one", first.nodeName());

    TextNode second = (TextNode) parsed.get(1);
    assertEquals("Two", second.text());
}

Example 32

Project: link-bubble-master File: OutputFormatter.java View source code

/**
 * Appends the text below {@code e} to {@code accum}, skipping every child
 * for which {@code unlikely(child)} is true. A single space is inserted
 * before a block element when the accumulator is non-empty and does not
 * already end in whitespace, and otherwise before a {@code br} element.
 *
 * @param e the element whose children are walked recursively
 * @param accum the builder that receives the text
 */
void appendTextSkipHidden(Element e, StringBuilder accum) {
    for (Node child : e.childNodes()) {
        if (unlikely(child)) {
            continue;
        }
        if (child instanceof Element) {
            Element element = (Element) child;
            boolean blockNeedsSeparator = accum.length() > 0 && element.isBlock() && !lastCharIsWhitespace(accum);
            if (blockNeedsSeparator || element.tagName().equals("br")) {
                accum.append(" ");
            }
            appendTextSkipHidden(element, accum);
        } else if (child instanceof TextNode) {
            accum.append(((TextNode) child).text());
        }
    }
}

Example 33

Project: open-data-service-master File: PegelPortalMvSourceAdapter.java View source code

/**
 * Collects the string form of all text nodes below {@code element}, in
 * document order, descending recursively into child elements.
 *
 * @param element the element to read
 * @return the concatenated text-node strings
 */
private String extractText(Element element) {
    StringBuilder collected = new StringBuilder();
    for (Node child : element.childNodes()) {
        if (child instanceof Element) {
            collected.append(extractText((Element) child));
            continue;
        }
        if (child instanceof TextNode) {
            // NOTE(review): uses toString() rather than TextNode.text(); confirm
            // that the node's serialised form is what callers expect.
            collected.append(child.toString());
        }
    }
    return collected.toString();
}

Example 34

Project: structured-content-tools-master File: StripHtmlPreprocessor.java View source code

/**
 * Appends the text of every non-blank text node to {@code buffer},
 * followed by a single separating space. Non-breaking spaces (U+00A0) are
 * converted to ordinary spaces before trimming so that they are treated
 * as whitespace.
 *
 * @param node the node being entered; only text nodes contribute
 * @param depth unused
 */
@Override
public void head(Node node, int depth) {
    if (node instanceof TextNode) {
        TextNode textNode = (TextNode) node;
        // Non-breaking space, written as an escape so the literal cannot be
        // mistaken for (or silently turned into) a regular space.
        String text = textNode.text().replace('\u00A0', ' ').trim();
        if (!text.isEmpty()) {
            buffer.append(text);
            // trim() guarantees the text does not end in a space, so a separator
            // is always needed; the last text gets the extra space too but it is
            // removed later
            buffer.append(" ");
        }
    }
}

Example 35

Project: Vaadin-SignatureField-master File: DeclarativeTestBaseBase.java View source code

/**
 * Writes {@code producedElem} to {@code sb} as predictable HTML: every
 * attribute is emitted as {@code key='value'} in alphabetical key order
 * and a closing tag is always written. Child elements are rendered
 * recursively and child text nodes are appended trimmed.
 *
 * @param producedElem the element to render
 * @param sb the builder that receives the markup
 * @return the full content of {@code sb} after rendering
 */
private String elementToHtml(Element producedElem, StringBuilder sb) {
    ArrayList<String> sortedKeys = new ArrayList<String>();
    for (Attribute attribute : producedElem.attributes().asList()) {
        sortedKeys.add(attribute.getKey());
    }
    Collections.sort(sortedKeys);

    sb.append("<").append(producedElem.tagName());
    for (String key : sortedKeys) {
        sb.append(" ").append(key);
        sb.append("=").append("\'").append(producedElem.attr(key)).append("\'");
    }
    sb.append(">");

    for (Node child : producedElem.childNodes()) {
        if (child instanceof TextNode) {
            sb.append(((TextNode) child).text().trim());
        } else if (child instanceof Element) {
            elementToHtml((Element) child, sb);
        }
    }

    sb.append("</").append(producedElem.tagName()).append(">");
    return sb.toString();
}

Example 36

Project: aMatch-master File: QuestionSearch.java View source code

/**
 * Called when a node is first seen. Text nodes contribute their text;
 * {@code li}, {@code dt}, heading ({@code h1}-{@code h5}) and {@code tr}
 * nodes contribute a fixed prefix instead.
 *
 * @param node the node being entered
 * @param depth unused
 */
public void head(Node node, int depth) {
    String name = node.nodeName();
    if (node instanceof TextNode) {
        // TextNodes carry all user-readable text in the DOM.
        append(((TextNode) node).text());
        return;
    }
    if (name.equals("li")) {
        append("\n * ");
        return;
    }
    if (name.equals("dt")) {
        append("  ");
        return;
    }
    // "p" is intentionally not in this list
    if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", "tr")) {
        append("\n");
    }
}

Example 37

Project: Android_RssReader-master File: Readability.java View source code

//    private static String GetArticleTitle(Element htmlNode)
//    {        
//        if (htmlNode.getElementsByTag("title") == null) 
//        	return null;
//        
//        Element titleNode = htmlNode.getElementsByTag("title").get(0);
//
//        String currTitle, origTitle;
//        currTitle = origTitle = GetInnerText(titleNode);
//
//        if (Regex.IsMatch(currTitle, @" [\|\-] "))
//        {
//            currTitle = Regex.Replace(origTitle,  @"(.*)[\|\-] .*", "$1");
//
//            if (currTitle.Split(' ').Length < 3)
//            {
//                currTitle = origTitle.Replace(@"[^\|\-]*[\|\-](.*)", "$1");
//            }
//        }
//        else if (currTitle.IndexOf(": ") != -1)
//        {
//            currTitle = Regex.Replace(origTitle, @".*:(.*)", "$1");
//
//            if(currTitle.Split(' ').Length < 3)
//            {
//                currTitle = Regex.Replace(origTitle, @"[^:]*[:](.*)", "$1");
//            }
//        }
//        else if (currTitle.Length > 150 || currTitle.Length < 15)
//        {
//            var hOnes = htmlNode.GetElementsByTagName("h1");
//            if (hOnes.Count == 1)
//            {
//                currTitle = GetInnerText(hOnes[0]);
//            }
//        }
//
//        if (currTitle.Split(' ').Length <= 4)
//        {
//            currTitle = origTitle;
//        }
//    
//        return currTitle.Trim();
//    }
/**
 * Picks the element of {@code doc} that most likely holds the article
 * content and returns its inner HTML.
 * <p>
 * The method works in three passes: (1) it removes elements whose
 * class/id look like unlikely candidates, collects {@code p}, {@code td}
 * and {@code pre} elements for scoring and converts {@code div}s without
 * block-level content into {@code p}s; (2) it scores the parent and
 * grandparent of each collected element; (3) it scales the scores by link
 * density and keeps the best candidate. The document is modified in place.
 *
 * @param doc the parsed page; must have a body
 * @return the inner HTML of the best candidate, or {@code null} if no
 *         candidate was found
 */
private static String GetArticleContent(Document doc) {
    Element body = doc.body();
    List<Element> allElements = body.getAllElements();
    List<Element> nodesToScore = new ArrayList<Element>();
    for (int nodeIndex = 0, len = allElements.size(); nodeIndex < len; nodeIndex++) {
        Element node = allElements.get(nodeIndex);
        // Match against class AND id. The previous form
        // "hasAttr(class) ? attr(class) : "" + attr(id)" dropped the id whenever a
        // class was present, because '+' binds tighter than '?:'. attr() yields an
        // empty string for a missing attribute, so plain concatenation is enough.
        String unlikelyMatchString = node.attr("class") + node.attr("id");
        if (s_unlikelyCandidates.matcher(unlikelyMatchString).find() && !s_okMaybeItsACandidate.matcher(unlikelyMatchString).find() && !node.nodeName().equals("body") && !node.nodeName().equals("html") && !node.nodeName().equals("head")) {
            node.remove();
            continue;
        }
        if (node.nodeName().equals("p") || node.nodeName().equals("td") || node.nodeName().equals("pre")) {
            nodesToScore.add(node);
        }
        if (node.nodeName().equals("div")) {
            if (!s_divToPElements.matcher(node.html()).find()) {
                // div without block-level content: turn it into a paragraph
                if (node.ownerDocument() != null) {
                    Element newNode = node.ownerDocument().createElement("p");
                    newNode.html(node.html());
                    node.replaceWith(newNode);
                    nodesToScore.add(newNode);
                }
            } else {
                // div with block-level content: wrap its loose text in paragraphs
                for (Node childNode : node.childNodes()) {
                    if (childNode instanceof TextNode) {
                        if (node.ownerDocument() != null) {
                            Element p = node.ownerDocument().createElement("p");
                            p.html(((TextNode) childNode).text());
                            childNode.replaceWith(p);
                        }
                    }
                }
            }
        }
    }
    Map<Element, Integer> scores = new HashMap<Element, Integer>();
    List<Element> candidates = new ArrayList<Element>();
    for (int pt = 0, len = nodesToScore.size(); pt < len; pt++) {
        Element parentNode = nodesToScore.get(pt).parent();
        Element grandParentNode = parentNode != null ? parentNode.parent() : null;
        String innerText = GetInnerText(nodesToScore.get(pt));
        if (parentNode == null)
            continue;
        if (parentNode.nodeName().equals("body"))
            continue;
        if (parentNode.nodeName().equals("html"))
            continue;
        if (parentNode.nodeName().equals("footer"))
            continue;
        if (parentNode.hasAttr("class") && parentNode.attr("class").equals("copyright"))
            continue;
        // paragraphs shorter than 25 characters are not scored
        if (innerText.length() < 25)
            continue;
        if (!scores.containsKey(parentNode)) {
            scores.put(parentNode, CalculateNodeScore(parentNode));
            candidates.add(parentNode);
        }
        if (grandParentNode != null && !scores.containsKey(grandParentNode)) {
            scores.put(grandParentNode, CalculateNodeScore(grandParentNode));
            candidates.add(grandParentNode);
        }
        // one base point for the paragraph itself
        int contentScore = 0;
        contentScore++;
        //for embed flash case
        if (innerText.contains("embed") && (innerText.contains("youku") || innerText.contains("tudou") || innerText.contains("ku6") || innerText.contains("sohu") || innerText.contains("weiphone") || innerText.contains("56") || innerText.contains("youtube") || innerText.contains("qq")))
            contentScore += 50;
        // one point per comma-separated segment (ASCII or full-width comma)
        contentScore += innerText.split("[,]|[，]").length;
        // one point per 100 characters, at most 3
        contentScore += Math.min(innerText.length() / 100, 3);
        // the parent gets the full score, the grandparent half of it
        int v = scores.get(parentNode);
        v += contentScore;
        scores.put(parentNode, v);
        if (grandParentNode != null) {
            v = scores.get(grandParentNode);
            v += contentScore / 2;
            scores.put(grandParentNode, v);
        }
    }
    Element topCandidate = null;
    for (Element cand : candidates) {
        // scale the score down by the share of link text in the candidate
        int v = scores.get(cand);
        v = (int) (v * (1 - GetLinkDensity(cand)));
        scores.put(cand, v);
        if (topCandidate == null || scores.get(cand) > scores.get(topCandidate)) {
            topCandidate = cand;
        }
        // NOTE(review): this fallback sits inside the loop, so it only runs when
        // the current best candidate is the body; with no candidates at all the
        // method returns null. Confirm whether it was meant to follow the loop.
        if (topCandidate == null || topCandidate.nodeName().equals("body")) {
            topCandidate = doc.createElement("div");
            topCandidate.html(body.html());
            body.html("");
            body.appendChild(topCandidate);
            scores.put(topCandidate, CalculateNodeScore(topCandidate));
        }
    }
    return topCandidate == null ? null : topCandidate.html();
}

Example 38

Project: anewjkuapp-master File: FeedEntryImpl.java View source code

/**
 * Builds a short description from the entry's description: the first
 * direct text node of the parsed body, else the text of the body's first
 * child element, else the whole document text. The result is cut after
 * the first sentence end and, if still longer than 350 characters,
 * shortened to 175 characters plus {@code "..."}. Failures while parsing
 * are logged and the text obtained so far is returned.
 *
 * @return the trimmed short description
 */
@Override
public String getShortDescription() {
    String shortDescr = htmlToStr(getDescription());
    try {
        Document doc = Jsoup.parse(shortDescr);
        Element body = doc.body();
        if (body == null) {
            shortDescr = doc.text();
        } else {
            List<TextNode> textNodes = body.textNodes();
            if (textNodes.size() > 0) {
                shortDescr = textNodes.get(0).getWholeText();
            } else {
                List<Element> children = body.children();
                shortDescr = children.size() > 0 ? children.get(0).text() : doc.text();
            }
        }
        shortDescr = shortDescr.trim();

        // keep only the first sentence: up to ".", "?" or "!" followed by whitespace
        Matcher sentenceEnd = Pattern.compile("(\\D\\.|\\?|\\!)(\\s+)").matcher(shortDescr);
        if (sentenceEnd.find()) {
            shortDescr = shortDescr.substring(0, sentenceEnd.end());
        }

        if (shortDescr.length() > 350) {
            shortDescr = shortDescr.substring(0, 175).trim() + "...";
        }
    } catch (Exception e) {
        Log.e(getClass().getSimpleName(), "gsd failed", e);
    }
    return shortDescr.trim();
}

Example 39

Project: brightspot-cms-master File: RichTextViewBuilder.java View source code

/**
 * Traverses the siblings all the way down the tree and converts them to
 * view nodes, then collapses runs of adjacent HTML string nodes (markup
 * that does NOT contain any rich text element) into a single HTML string
 * node.
 *
 * @param siblings the jsoup nodes to convert, in document order
 * @return the collapsed list of view nodes
 */
private List<RichTextViewNode<V>> toViewNodes(List<Node> siblings) {
    List<RichTextViewNode<V>> flat = new ArrayList<>();
    for (Node sibling : siblings) {
        if (sibling instanceof TextNode) {
            flat.add(new StringRichTextViewNode<>(((TextNode) sibling).text(), htmlToView));
            continue;
        }
        if (sibling instanceof DataNode) {
            flat.add(new StringRichTextViewNode<>(((DataNode) sibling).getWholeData(), htmlToView));
            continue;
        }
        if (!(sibling instanceof Element)) {
            continue;
        }

        Element element = (Element) sibling;
        RichTextElement rte = RichTextElement.fromElement(element);
        ObjectType tagType = rte != null ? rte.getState().getType() : null;
        if (rte != null && elementToView != null) {
            flat.add(new ElementRichTextViewNode<>(rte, elementToView));
            continue;
        }
        if (tagType != null && !keepUnboundElements) {
            continue;
        }

        List<RichTextViewNode<V>> childViewNodes = toViewNodes(element.childNodes());
        String html = element.outerHtml();
        if (element.tag().isSelfClosing()) {
            flat.add(new StringRichTextViewNode<>(html, htmlToView));
            continue;
        }
        // This deliberately does not validate the index values below, since a
        // non-self-closing element should always have those characters present
        // in the HTML.
        int firstGtAt = html.indexOf('>');
        int lastLtAt = html.lastIndexOf('<');
        String openTag = html.substring(0, firstGtAt + 1);
        String closeTag = html.substring(lastLtAt);
        flat.add(new StringRichTextViewNode<>(openTag, htmlToView));
        flat.addAll(childViewNodes);
        flat.add(new StringRichTextViewNode<>(closeTag, htmlToView));
    }

    // Collapse the nodes as much as possible.
    List<RichTextViewNode<V>> collapsed = new ArrayList<>();
    List<StringRichTextViewNode<V>> pending = new ArrayList<>();
    for (RichTextViewNode<V> viewNode : flat) {
        if (viewNode instanceof StringRichTextViewNode) {
            pending.add((StringRichTextViewNode<V>) viewNode);
            continue;
        }
        // NOTE(review): a merged string node is emitted here even when no string
        // nodes are pending (yielding an empty string), unlike the tail handling
        // below - confirm that this is intended.
        String merged = pending.stream().map(StringRichTextViewNode::getHtml).collect(Collectors.joining());
        collapsed.add(new StringRichTextViewNode<>(merged, htmlToView));
        pending.clear();
        collapsed.add(viewNode);
    }
    if (!pending.isEmpty()) {
        String merged = pending.stream().map(StringRichTextViewNode::getHtml).collect(Collectors.joining());
        collapsed.add(new StringRichTextViewNode<>(merged, htmlToView));
    }
    return collapsed;
}

Example 40

Project: dogeared-extruder-master File: Readability.java View source code

// CHECKSTYLE:OFF
private Element grabArticle(Element pageElement) {
    boolean isPaging = pageElement != null;
    if (pageElement == null) {
        pageElement = body;
    }
    String pageCacheHtml = pageElement.html();
    Elements allElements = pageElement.getAllElements();
    /*
         * Note: in Javascript, this list would be *live*. If you deleted a node from the tree, it and its
         * children would remove themselves. To get the same effect, we make a linked list and we remove
         * things from it. This won't win prizes for speed, but, then again, the code in Javascript has to be
         * doing something nearly as awful.
         */
    LinkedList<Element> allElementsList = new LinkedList<Element>();
    allElementsList.addAll(allElements);
    /**
         * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc),
         * and turn divs into P tags where they have been used inappropriately (as in, where they contain no
         * other block level elements.) Note: Assignment from index for performance. See
         * http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 TODO: Shouldn't this be a reverse
         * traversal?
         **/
    List<Element> nodesToScore = new ArrayList<Element>();
    ListIterator<Element> elIterator = allElementsList.listIterator();
    Set<Element> goodAsDead = new HashSet<Element>();
    while (elIterator.hasNext()) {
        Element node = elIterator.next();
        if (goodAsDead.contains(node)) {
            continue;
        }
        /* Remove unlikely candidates */
        if (stripUnlikelyCandidates) {
            String unlikelyMatchString = node.className() + node.id();
            if (Patterns.exists(Patterns.UNLIKELY_CANDIDATES, unlikelyMatchString) && !Patterns.exists(Patterns.OK_MAYBE_ITS_A_CANDIDATE, unlikelyMatchString) && !"body".equals(node.tagName())) {
                LOG.debug("Removing unlikely candidate - " + unlikelyMatchString);
                List<Element> toRemoveAndBelow = node.getAllElements();
                elIterator.remove();
                /*
                     * adding 'node' to that set is harmless and reduces the code complexity here.
                     */
                goodAsDead.addAll(toRemoveAndBelow);
                continue;
            }
        }
        if ("p".equals(node.tagName()) || "td".equals(node.tagName()) || "pre".equals(node.tagName())) {
            nodesToScore.add(node);
        }
        /*
             * Turn all divs that don't have children block level elements into p's
             */
        if ("div".equals(node.tagName())) {
            boolean hasBlock = false;
            for (Element divChild : node.getAllElements()) {
                if (divChild != node) {
                    if (DIV_TO_P_ELEMENTS.contains(divChild.tagName())) {
                        hasBlock = true;
                        break;
                    }
                }
            }
            if (!hasBlock) {
                Element newElement = changeElementTag(node, "p");
                nodesToScore.remove(node);
                nodesToScore.add(newElement);
            } else {
                /* EXPERIMENTAL */
                //*
                int limit = node.childNodes().size();
                for (int i = 0; i < limit; i++) {
                    Node childNode = node.childNodes().get(i);
                    if (childNode instanceof TextNode) {
                        Element p = document.createElement("p");
                        p.attr("basisInline", "true");
                        p.html(((TextNode) childNode).text());
                        childNode.replaceWith(p);
                    }
                }
            }
        }
    }
    /**
         * Loop through all paragraphs, and assign a score to them based on how content-y they look. Then add
         * their score to their parent node. A score is determined by things like number of commas, class
         * names, etc. Maybe eventually link density.
         **/
    List<Element> candidates = new ArrayList<Element>();
    for (Element nodeToScore : nodesToScore) {
        Element parentNode = nodeToScore.parent();
        if (null == parentNode) {
            // dropped previously.
            continue;
        }
        Element grandParentNode = parentNode.parent();
        if (grandParentNode == null) {
            // ditto
            continue;
        }
        String innerText = nodeToScore.text();
        /*
             * If this paragraph is less than 25 characters, don't even count it.
             */
        if (innerText.length() < 25) {
            continue;
        }
        /* Initialize readability data for the parent. */
        if ("".equals(parentNode.attr("readability"))) {
            initializeNode(parentNode);
            candidates.add(parentNode);
        }
        /*
             * If the grandparent has no parent, we don't want it as a candidate. It's probably a symptom that
             * we're operating in an orphan.
             */
        if (grandParentNode.parent() != null && "".equals(grandParentNode.attr("readability"))) {
            initializeNode(grandParentNode);
            candidates.add(grandParentNode);
        }
        double contentScore = 0;
        /* Add a point for the paragraph itself as a base. */
        contentScore++;
        /* Add points for any commas within this paragraph */
        contentScore += innerText.split(",").length;
        /*
             * For every 100 characters in this paragraph, add another point. Up to 3 points.
             */
        contentScore += Math.min(Math.floor(innerText.length() / 100.0), 3.0);
        /* Add the score to the parent. The grandparent gets half. */
        incrementContentScore(parentNode, contentScore);
        if (grandParentNode != null) {
            incrementContentScore(grandParentNode, contentScore / 2.0);
        }
    }
    /**
         * After we've calculated scores, loop through all of the possible candidate nodes we found and find
         * the one with the highest score.
         **/
    Element topCandidate = null;
    for (Element candidate : candidates) {
        /**
             * Scale the final candidates score based on link density. Good content should have a relatively
             * small link density (5% or less) and be mostly unaffected by this operation.
             **/
        double score = getContentScore(candidate);
        double newScore = score * (1.0 - getLinkDensity(candidate));
        setContentScore(candidate, newScore);
        LOG.debug("Candidate [" + candidate.getClass() + "] (" + candidate.className() + ":" + candidate.id() + ") with score " + newScore);
        if (null == topCandidate || newScore > getContentScore(topCandidate)) {
            topCandidate = candidate;
        }
    }
    /**
         * If we still have no top candidate, just use the body as a last resort. We also have to copy the
         * body node so it is something we can modify.
         **/
    if (topCandidate == null || topCandidate == body) {
        topCandidate = document.createElement("div");
        // not efficient but not likely.
        topCandidate.html(pageElement.html());
        pageElement.html("");
        pageElement.appendChild(topCandidate);
        initializeNode(topCandidate);
    }
    /**
         * Now that we have the top candidate, look through its siblings for content that might also be
         * related. Things like preambles, content split by ads that we removed, etc.
         **/
    Element articleContent = document.createElement("div");
    if (isPaging) {
        articleContent.attr("id", "readability-content");
    }
    double siblingScoreThreshold = Math.max(10, getContentScore(topCandidate) * 0.2);
    List<Element> siblingNodes = topCandidate.parent().children();
    for (Element siblingNode : siblingNodes) {
        boolean scored = isElementScored(siblingNode);
        boolean append = false;
        LOG.debug("Looking at sibling node: [" + siblingNode.getClass() + "] (" + siblingNode.className() + ":" + siblingNode.id() + ")");
        if (scored) {
            LOG.debug("Sibling has score " + getContentScore(siblingNode));
        } else {
            LOG.debug("Sibling has score unknown");
        }
        if (siblingNode == topCandidate) {
            append = true;
        }
        double contentBonus = 0;
        /*
             * Give a bonus if sibling nodes and top candidates have the example same classname
             */
        if (siblingNode.className().equals(topCandidate.className()) && !"".equals(topCandidate.className())) {
            contentBonus += getContentScore(topCandidate) * 0.2;
        }
        if (scored && (getContentScore(siblingNode) + contentBonus >= siblingScoreThreshold)) {
            append = true;
        }
        if ("p".equals(siblingNode.tagName())) {
            double linkDensity = getLinkDensity(siblingNode);
            String nodeContent = siblingNode.text();
            int nodeLength = nodeContent.length();
            if (nodeLength > 80 && linkDensity < 0.25) {
                append = true;
            } else if (nodeLength < 80 && linkDensity == 0 && Patterns.exists(Patterns.ENDS_WITH_DOT, nodeContent)) {
                append = true;
            }
        }
        if (append) {
            LOG.debug("Appending node: [" + siblingNode.getClass() + "]");
            Element nodeToAppend = null;
            if (!"div".equals(siblingNode.tagName()) && !"p".equals(siblingNode.tagName())) {
                /*
                     * We have a node that isn't a common block level element, like a form or td tag. Turn it
                     * into a div so it doesn't get filtered out later by accident.
                     */
                LOG.debug("Altering siblingNode of " + siblingNode.tagName() + " to div.");
                nodeToAppend = changeElementTag(siblingNode, "div");
            } else {
                nodeToAppend = siblingNode;
            }
            /*
                 * To ensure a node does not interfere with readability styles, remove its classnames
                 */
            nodeToAppend.removeAttr("class");
            /*
                 * Append sibling and subtract from our list because it removes the node when you append to
                 * another node
                 */
            articleContent.appendChild(nodeToAppend);
        }
    }
    document.body().empty();
    document.body().appendChild(articleContent);
    /**
         * So we have all of the content that we need. Now we clean it up for presentation.
         **/
    prepArticle(articleContent);
    /**
         * Now that we've gone through the full algorithm, check to see if we got any meaningful content. If
         * we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
         * likelihood of finding the content, and the sieve approach gives us a higher likelihood of finding
         * the -right- content.
         **/
    if (articleContent.text().length() < 250) {
        pageElement.html(pageCacheHtml);
        if (stripUnlikelyCandidates) {
            try {
                stripUnlikelyCandidates = false;
                return grabArticle(pageElement);
            } finally {
                stripUnlikelyCandidates = true;
            }
        } else if (classWeight) {
            try {
                classWeight = false;
                return grabArticle(pageElement);
            } finally {
                classWeight = true;
            }
        } else if (cleanConditionally) {
            try {
                cleanConditionally = false;
                return grabArticle(pageElement);
            } finally {
                cleanConditionally = true;
            }
        } else {
            return null;
        }
    }
    return articleContent;
}

Example 41

Project: ez-vcard-master File: HCardElement.java View source code

/**
 * Recursively collects the text beneath {@code element} into {@code value}.
 * <p>
 * Text nodes contribute their text. Child elements are descended into, except that elements
 * carrying the class "type" and "del" elements are skipped entirely, and "br" elements are
 * rendered as {@code NEWLINE}. Other node kinds are ignored.
 *
 * @param element the element whose child nodes are visited
 * @param value the accumulator the text is appended to
 */
private void visitForValue(Element element, StringBuilder value) {
    for (Node child : element.childNodes()) {
        if (child instanceof TextNode) {
            value.append(((TextNode) child).text());
        } else if (child instanceof Element) {
            Element childElement = (Element) child;
            if (childElement.classNames().contains("type")) {
                // "type" elements carry no value text
                continue;
            }
            String tag = childElement.tagName();
            if ("br".equals(tag)) {
                // a line break in the markup becomes a newline in the value
                value.append(NEWLINE);
            } else if (!"del".equals(tag)) {
                // everything except deleted text is descended into
                visitForValue(childElement, value);
            }
        }
    }
}

Example 42

Project: Java-readability-master File: Readability.java View source code

// CHECKSTYLE:OFF
/**
 * Extracts the main article content from a page and returns it wrapped in a new "div" element.
 * <p>
 * The method works in phases: (1) prepare the nodes (optionally drop unlikely candidates, turn
 * block-free divs into paragraphs, wrap bare text of other divs in paragraphs); (2) score each
 * "p", "td" and "pre" node and credit the score to its parent and grandparent; (3) pick the
 * candidate with the highest link-density-scaled score; (4) collect the top candidate and its
 * qualifying siblings into a new container; (5) replace the document body's content with that
 * container and clean it up via {@code prepArticle}.
 * <p>
 * If the resulting text is shorter than 250 characters, the page HTML is restored from the copy
 * taken at entry and the method calls itself again with one more of the flags
 * {@code stripUnlikelyCandidates}, {@code classWeight}, {@code cleanConditionally} switched off
 * (in that order); each flag is switched back on in a {@code finally} block.
 * <p>
 * Side effects: mutates the DOM under {@code pageElement} and the body of {@code document}.
 *
 * @param pageElement the element to extract from; when {@code null} the {@code body} field is
 *        used and the result is not tagged with the "readability-content" id
 * @return the element holding the extracted content, or {@code null} when the content is still
 *         shorter than 250 characters with all three flags already off
 */
private Element grabArticle(Element pageElement) {
    boolean isPaging = pageElement != null;
    if (pageElement == null) {
        pageElement = body;
    }
    /* Snapshot of the markup, used to restore the page before a retry (see the end of the method). */
    String pageCacheHtml = pageElement.html();
    Elements allElements = pageElement.getAllElements();
    /*
         * Note: in Javascript, this list would be *live*. If you deleted a node from the tree, it and its
         * children would remove themselves. To get the same effect, we make a linked list and we remove
         * things from it. This won't win prizes for speed, but, then again, the code in Javascript has to be
         * doing something nearly as awful.
         */
    LinkedList<Element> allElementsList = new LinkedList<Element>();
    allElementsList.addAll(allElements);
    /**
         * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc),
         * and turn divs into P tags where they have been used inappropriately (as in, where they contain no
         * other block level elements.) Note: Assignment from index for performance. See
         * http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 TODO: Shouldn't this be a reverse
         * traversal?
         **/
    List<Element> nodesToScore = new ArrayList<Element>();
    ListIterator<Element> elIterator = allElementsList.listIterator();
    /* Elements below an already-dropped node; they are skipped when the iteration reaches them. */
    Set<Element> goodAsDead = new HashSet<Element>();
    while (elIterator.hasNext()) {
        Element node = elIterator.next();
        if (goodAsDead.contains(node)) {
            continue;
        }
        /* Remove unlikely candidates */
        if (stripUnlikelyCandidates) {
            String unlikelyMatchString = node.className() + node.id();
            if (Patterns.exists(Patterns.UNLIKELY_CANDIDATES, unlikelyMatchString) && !Patterns.exists(Patterns.OK_MAYBE_ITS_A_CANDIDATE, unlikelyMatchString) && !"body".equals(node.tagName())) {
                LOG.debug("Removing unlikely candidate - " + unlikelyMatchString);
                List<Element> toRemoveAndBelow = node.getAllElements();
                /* Only the working list is changed here; no DOM removal call is made in this branch. */
                elIterator.remove();
                /*
                     * adding 'node' to that set is harmless and reduces the code complexity here.
                     */
                goodAsDead.addAll(toRemoveAndBelow);
                continue;
            }
        }
        if ("p".equals(node.tagName()) || "td".equals(node.tagName()) || "pre".equals(node.tagName())) {
            nodesToScore.add(node);
        }
        /*
             * Turn all divs that don't have children block level elements into p's
             */
        if ("div".equals(node.tagName())) {
            boolean hasBlock = false;
            /* getAllElements() is compared against 'node' below, so the div itself is not counted. */
            for (Element divChild : node.getAllElements()) {
                if (divChild != node) {
                    if (DIV_TO_P_ELEMENTS.contains(divChild.tagName())) {
                        hasBlock = true;
                        break;
                    }
                }
            }
            if (!hasBlock) {
                /* The element returned by changeElementTag is the one that gets scored from here on. */
                Element newElement = changeElementTag(node, "p");
                nodesToScore.remove(node);
                nodesToScore.add(newElement);
            } else {
                /* EXPERIMENTAL */
                /* Wrap each direct text child of the div in its own <p basisInline="true">. */
                //*
                int limit = node.childNodes().size();
                for (int i = 0; i < limit; i++) {
                    Node childNode = node.childNodes().get(i);
                    if (childNode instanceof TextNode) {
                        Element p = document.createElement("p");
                        p.attr("basisInline", "true");
                        /*
                         * NOTE(review): html(...) parses its argument as markup, so text containing
                         * '<' or '&' is re-interpreted rather than copied verbatim - confirm intended.
                         */
                        p.html(((TextNode) childNode).text());
                        childNode.replaceWith(p);
                    }
                }
            }
        }
    }
    /**
         * Loop through all paragraphs, and assign a score to them based on how content-y they look. Then add
         * their score to their parent node. A score is determined by things like number of commas, class
         * names, etc. Maybe eventually link density.
         **/
    List<Element> candidates = new ArrayList<Element>();
    for (Element nodeToScore : nodesToScore) {
        Element parentNode = nodeToScore.parent();
        if (null == parentNode) {
            // dropped previously.
            continue;
        }
        Element grandParentNode = parentNode.parent();
        if (grandParentNode == null) {
            // ditto
            continue;
        }
        String innerText = nodeToScore.text();
        /*
             * If this paragraph is less than 25 characters, don't even count it.
             */
        if (innerText.length() < 25) {
            continue;
        }
        /* Initialize readability data for the parent. */
        if ("".equals(parentNode.attr("readability"))) {
            initializeNode(parentNode);
            candidates.add(parentNode);
        }
        /*
             * If the grandparent has no parent, we don't want it as a candidate. It's probably a symptom that
             * we're operating in an orphan.
             */
        if (grandParentNode.parent() != null && "".equals(grandParentNode.attr("readability"))) {
            initializeNode(grandParentNode);
            candidates.add(grandParentNode);
        }
        double contentScore = 0;
        /* Add a point for the paragraph itself as a base. */
        contentScore++;
        /*
             * Add points for any commas within this paragraph. Note that split(",").length is the number
             * of comma-separated segments (trailing empty segments are not counted by String.split).
             */
        contentScore += innerText.split(",").length;
        /*
             * For every 100 characters in this paragraph, add another point. Up to 3 points.
             */
        contentScore += Math.min(Math.floor(innerText.length() / 100.0), 3.0);
        /* Add the score to the parent. The grandparent gets half. */
        incrementContentScore(parentNode, contentScore);
        /* Always true at this point: a null grandparent was skipped with 'continue' above. */
        if (grandParentNode != null) {
            incrementContentScore(grandParentNode, contentScore / 2.0);
        }
    }
    /**
         * After we've calculated scores, loop through all of the possible candidate nodes we found and find
         * the one with the highest score.
         **/
    Element topCandidate = null;
    for (Element candidate : candidates) {
        /**
             * Scale the final candidates score based on link density. Good content should have a relatively
             * small link density (5% or less) and be mostly unaffected by this operation.
             **/
        double score = getContentScore(candidate);
        double newScore = score * (1.0 - getLinkDensity(candidate));
        setContentScore(candidate, newScore);
        LOG.debug("Candidate [" + candidate.getClass() + "] (" + candidate.className() + ":" + candidate.id() + ") with score " + newScore);
        /* Strict '>' keeps the earliest candidate when scores tie. */
        if (null == topCandidate || newScore > getContentScore(topCandidate)) {
            topCandidate = candidate;
        }
    }
    /**
         * If we still have no top candidate, just use the body as a last resort. We also have to copy the
         * body node so it is something we can modify.
         **/
    if (topCandidate == null || topCandidate == body) {
        topCandidate = document.createElement("div");
        // not efficient but not likely.
        topCandidate.html(pageElement.html());
        pageElement.html("");
        pageElement.appendChild(topCandidate);
        initializeNode(topCandidate);
    }
    /**
         * Now that we have the top candidate, look through its siblings for content that might also be
         * related. Things like preambles, content split by ads that we removed, etc.
         **/
    Element articleContent = document.createElement("div");
    if (isPaging) {
        articleContent.attr("id", "readability-content");
    }
    /* A sibling needs at least 20% of the top candidate's score, and never less than 10. */
    double siblingScoreThreshold = Math.max(10, getContentScore(topCandidate) * 0.2);
    List<Element> siblingNodes = topCandidate.parent().children();
    for (Element siblingNode : siblingNodes) {
        boolean scored = isElementScored(siblingNode);
        boolean append = false;
        LOG.debug("Looking at sibling node: [" + siblingNode.getClass() + "] (" + siblingNode.className() + ":" + siblingNode.id() + ")");
        if (scored) {
            LOG.debug("Sibling has score " + getContentScore(siblingNode));
        } else {
            LOG.debug("Sibling has score unknown");
        }
        /* The top candidate itself is always kept. */
        if (siblingNode == topCandidate) {
            append = true;
        }
        double contentBonus = 0;
        /*
             * Give a bonus if sibling nodes and top candidates have the exact same classname
             */
        if (siblingNode.className().equals(topCandidate.className()) && !"".equals(topCandidate.className())) {
            contentBonus += getContentScore(topCandidate) * 0.2;
        }
        if (scored && (getContentScore(siblingNode) + contentBonus >= siblingScoreThreshold)) {
            append = true;
        }
        /*
             * Unscored paragraphs can still qualify: long ones with few links, or short link-free ones
             * that match ENDS_WITH_DOT. A paragraph of exactly 80 characters matches neither branch.
             */
        if ("p".equals(siblingNode.tagName())) {
            double linkDensity = getLinkDensity(siblingNode);
            String nodeContent = siblingNode.text();
            int nodeLength = nodeContent.length();
            if (nodeLength > 80 && linkDensity < 0.25) {
                append = true;
            } else if (nodeLength < 80 && linkDensity == 0 && Patterns.exists(Patterns.ENDS_WITH_DOT, nodeContent)) {
                append = true;
            }
        }
        if (append) {
            LOG.debug("Appending node: [" + siblingNode.getClass() + "]");
            Element nodeToAppend = null;
            if (!"div".equals(siblingNode.tagName()) && !"p".equals(siblingNode.tagName())) {
                /*
                     * We have a node that isn't a common block level element, like a form or td tag. Turn it
                     * into a div so it doesn't get filtered out later by accident.
                     */
                LOG.debug("Altering siblingNode of " + siblingNode.tagName() + " to div.");
                nodeToAppend = changeElementTag(siblingNode, "div");
            } else {
                nodeToAppend = siblingNode;
            }
            /*
                 * To ensure a node does not interfere with readability styles, remove its classnames
                 */
            nodeToAppend.removeAttr("class");
            /*
                 * Append sibling and subtract from our list because it removes the node when you append to
                 * another node
                 */
            articleContent.appendChild(nodeToAppend);
        }
    }
    /* The document body now contains nothing but the collected article content. */
    document.body().empty();
    document.body().appendChild(articleContent);
    /**
         * So we have all of the content that we need. Now we clean it up for presentation.
         **/
    prepArticle(articleContent);
    /**
         * Now that we've gone through the full algorithm, check to see if we got any meaningful content. If
         * we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
         * likelihood of finding the content, and the sieve approach gives us a higher likelihood of finding
         * the -right- content.
         **/
    if (articleContent.text().length() < 250) {
        /* Undo this pass's DOM changes under pageElement before retrying. */
        pageElement.html(pageCacheHtml);
        /* Each retry disables one more flag; 'finally' re-enables it once the nested call returns. */
        if (stripUnlikelyCandidates) {
            try {
                stripUnlikelyCandidates = false;
                return grabArticle(pageElement);
            } finally {
                stripUnlikelyCandidates = true;
            }
        } else if (classWeight) {
            try {
                classWeight = false;
                return grabArticle(pageElement);
            } finally {
                classWeight = true;
            }
        } else if (cleanConditionally) {
            try {
                cleanConditionally = false;
                return grabArticle(pageElement);
            } finally {
                cleanConditionally = true;
            }
        } else {
            /* All three flags are already off: give up. */
            return null;
        }
    }
    return articleContent;
}

Example 43

Project: jooby-master File: Doc.java View source code

/**
 * Emits the opening, Markdown-style marker for {@code node} into {@code builder}.
 * <p>
 * Nothing is written while {@code isInToc} is set. Text nodes are appended with non-breaking
 * spaces (U+00A0) converted to regular spaces. Elements are mapped by tag name to a prefix
 * (heading hashes, emphasis markers, list bullets, code fences, ...); unknown tags are only
 * logged at debug level. Presumably this is the {@code head} callback of a jsoup node visitor,
 * with the closing markers written by the matching {@code tail} callback - not visible here.
 *
 * @param node the node being entered
 * @param depth the depth of the node in the tree (not used by this method)
 */
@Override
public void head(final Node node, final int depth) {
    if (isInToc) {
        return;
    }
    if (node instanceof TextNode) {
        TextNode textNode = (TextNode) node;
        // Replace non-breaking spaces with plain spaces. The character is written as an escape
        // so the source does not depend on an invisible literal, and no regex is involved.
        String txt = textNode.text().replace('\u00A0', ' ');
        builder.append(txt);
    } else if (node instanceof Element) {
        Element element = (Element) node;
        switch(element.tagName()) {
            case "span":
            case "blockquote":
                // ignored
                break;
            case "ol":
            case "ul":
                listDepth += 1;
                // falls through: entering a list also starts a new line
            case "br":
            case "p":
                builder.append("\n");
                break;
            case "div":
                builder.append("\n");
                break;
            case "h1":
                builder.append("\n# ");
                break;
            case "h2":
                builder.append("\n## ");
                break;
            case "h3":
                builder.append("\n### ");
                break;
            case "h4":
                builder.append("\n#### ");
                // falls through, so an h4 also receives the bold marker below.
                // NOTE(review): h1-h3 break here; confirm the fall-through is intended and that
                // the closing handler emits the matching "**" for h4.
            case "b":
            case "strong":
                builder.append("**");
                break;
            case "cite":
            case "i":
            case "u":
                builder.append("*");
                break;
            case "a":
                builder.append('[');
                break;
            case "li":
                // one space of indentation per nesting level below the first
                for (int i = 0; i < listDepth - 1; i++) {
                    builder.append(" ");
                }
                builder.append(element.parent().tagName().equals("ol") ? "1. " : "* ");
                break;
            case "code":
                builder.append("`");
                break;
            case "strike":
                // kept as a literal HTML tag
                builder.append("<").append(element.tagName()).append(">");
                break;
            case "img":
                String src = element.attr("src");
                String alt = element.attr("alt");
                alt = alt == null ? "" : alt;
                if (src != null) {
                    builder.append("![").append(alt).append("](").append(src).append(")\n");
                }
                break;
            case "pre":
                builder.append("```\n");
                break;
            case "hr":
                builder.append("\n***\n");
                break;
            case "font":
                // a monospace font face is treated like inline code
                String face = element.attr("face");
                if (face != null && face.contains("monospace")) {
                    builder.append("`");
                }
                break;
            default:
                log.debug("Unhandled element {}", element.tagName());
        }
    }
}

Example 44

Project: LastCalc-master File: Renderers.java View source code

/**
 * Appends an HTML rendering of {@code obj} to {@code renderTo}, preceded by a single space.
 * <p>
 * Rendering depends on the runtime type of {@code obj}:
 * <ul>
 * <li>{@code Map} / {@code List}: a span (class "map") with the entries rendered recursively;
 * replaced by a "too big" placeholder when the rendered text exceeds 120 characters.</li>
 * <li>{@code Amount}: a span (class "amount") holding a currency, temperature or
 * number-plus-unit rendering.</li>
 * <li>JScience {@code Number} and {@code Radix}: a span with class "number".</li>
 * <li>{@code UserDefinedParser}: the result of {@code toHtml} for its {@code after} member.</li>
 * <li>A key of {@code variables}: a highlighted variable span whose colour is picked from
 * {@code variableColors} by the mapped index.</li>
 * <li>A non-empty {@code String} starting with an upper-case character: a highlighted "white"
 * variable span.</li>
 * <li>{@code DocumentWrapper}: its title wrapped in an {@code <html>} element.</li>
 * <li>Anything else: a plain text node with {@code obj.toString()}.</li>
 * </ul>
 *
 * @param baseUri base URI passed on to created text nodes and to {@code toHtml}
 * @param variables maps variable names to the index used to choose their highlight colour
 * @param renderTo the element that receives the rendering
 * @param obj the object to render; must not be {@code null}
 */
private static void renderObject(final String baseUri, final Map<String, Integer> variables, final Element renderTo, final Object obj) {
    renderTo.append(" ");
    if (obj instanceof Map) {
        final Map<Object, Object> map = (Map<Object, Object>) obj;
        final int mapSize = map.size();
        final Element mapSpan = renderTo.appendElement("span").addClass("map");
        mapSpan.append("{");
        int count = 0;
        for (final Entry<Object, Object> e : map.entrySet()) {
            renderObject(baseUri, variables, mapSpan, e.getKey());
            mapSpan.append(" :");
            renderObject(baseUri, variables, mapSpan, e.getValue());
            // separator between entries, but not after the last one
            if (count < mapSize - 1) {
                mapSpan.append(", ");
            }
            count++;
        }
        mapSpan.append("}");
        // Over-long renderings are replaced by a placeholder stating their length.
        final int textLength = mapSpan.text().length();
        if (textLength > 120) {
            mapSpan.html("{ too big (" + textLength + " chars) }");
        }
    } else if (obj instanceof List) {
        final List<Object> list = (List<Object>) obj;
        final int listSize = list.size();
        // Lists use the same "map" class as maps.
        final Element listSpan = renderTo.appendElement("span").addClass("map");
        listSpan.append("[");
        int count = 0;
        for (final Object e : list) {
            renderObject(baseUri, variables, listSpan, e);
            if (count < listSize - 1) {
                listSpan.append(", ");
            }
            count++;
        }
        listSpan.append("]");
        final int textLength = listSpan.text().length();
        if (textLength > 120) {
            listSpan.html("[ too big (" + textLength + " chars) ]");
        }
    } else if (obj instanceof Amount) {
        final Amount<?> amount = (Amount<?>) obj;
        Unit<? extends Quantity> unit = amount.getUnit();
        log.log(Level.INFO, "Amount: " + amount + ", unit type: " + unit.getClass().getCanonicalName());
        final Element amountSpan = renderTo.appendElement("span").addClass("amount");
        final double estimatedValue = amount.getEstimatedValue();
        if (unit instanceof Currency) {
            final Element currencySpan = amountSpan.appendElement("span").addClass("currency");
            final Currency currency = (Currency) unit;
            // Well-known currencies get a symbol prefix; all others get their code as a suffix.
            if (currency.getCode().equalsIgnoreCase("USD")) {
                currencySpan.html("US$" + currencyFormat.format(estimatedValue));
            } else if (currency.getCode().equalsIgnoreCase("GBP")) {
                currencySpan.html("£" + currencyFormat.format(estimatedValue));
            } else if (currency.getCode().equalsIgnoreCase("EUR")) {
                currencySpan.html("€" + currencyFormat.format(estimatedValue));
            } else if (currency.getCode().equalsIgnoreCase("JPY")) {
                currencySpan.html("¥" + currencyFormat.format(estimatedValue));
            } else {
                currencySpan.text(currencyFormat.format(estimatedValue) + currency.getCode());
            }
        } else if (unit.equals(NonSI.FAHRENHEIT) || unit.equals(SI.CELSIUS) || unit.equals(SI.KELVIN)) {
            // Avoid "33 fahrenheits"
            final Element temperatureSpan = amountSpan.appendElement("span").addClass("temperature");
            temperatureSpan.text(estimatedValue + unit.toString());
        } else {
            final String numStr = Misc.numberFormat.format(estimatedValue);
            amountSpan.appendElement("span").addClass("number").text(numStr);
            amountSpan.appendText(" ");
            // Dimensionless amounts show the number only.
            if (!unit.equals(Unit.ONE)) {
                final Element unitSpan = amountSpan.appendElement("span").addClass("recognized");
                // singular name for exactly 1.0, plural otherwise; fall back to the unit's own text
                final String verboseName = estimatedValue == 1.0 ? UnitParser.verboseNamesSing.get(unit) : UnitParser.verboseNamesPlur.get(unit);
                if (verboseName != null) {
                    unitSpan.text(verboseName);
                } else {
                    unitSpan.text(unit.toString());
                }
            }
        }
    } else if (obj instanceof org.jscience.mathematics.number.Number) {
        final org.jscience.mathematics.number.Number<?> num = (org.jscience.mathematics.number.Number<?>) obj;
        final String numStr = Misc.numberFormat.format(num.doubleValue());
        renderTo.appendElement("span").addClass("number").text(numStr);
    } else if (obj instanceof Radix) {
        renderTo.appendElement("span").addClass("number").text(obj.toString());
    } else if (obj instanceof UserDefinedParser) {
        renderTo.appendChild(toHtml(baseUri, ((UserDefinedParser) obj).after));
    } else if (variables.containsKey(obj)) {
        // Known variable: the colour cycles through variableColors by the variable's index.
        final String color = variableColors.get(variables.get(obj) % variableColors.size());
        renderTo.appendElement("span").addClass("highlighted").addClass("variable").addClass(color).text((String) obj);
    } else if (obj instanceof String && !((String) obj).isEmpty() && Character.isUpperCase(((String) obj).charAt(0))) {
        // The emptiness check keeps charAt(0) from throwing on ""; an empty string is rendered
        // by the plain-text branch at the end instead.
        renderTo.appendElement("span").addClass("highlighted").addClass("variable").addClass("white").text((String) obj);
    } else if (obj instanceof DocumentWrapper) {
        // NOTE(review): the title is concatenated into markup without escaping - confirm it is trusted.
        renderTo.append("<html>" + ((DocumentWrapper) obj).title() + " ... </html>");
    } else {
        renderTo.appendChild(new TextNode(obj.toString(), baseUri));
    }
}

Example 45

Project: Lightning-Browser-master File: OutputFormatter.java View source code

/**
 * Appends the text found beneath {@code e} to {@code accum}, skipping every child node for
 * which {@code unlikely(...)} returns true.
 * <p>
 * Before descending into a child element a single space is inserted when either the element is
 * a block element following non-empty output that does not already end in whitespace, or the
 * element is a "br".
 *
 * @param e the element whose children are walked
 * @param accum the accumulator receiving the text
 * @param indent current recursion depth; only passed on, incremented, to nested calls
 */
private void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
    for (Node current : e.childNodes()) {
        if (unlikely(current)) {
            continue;
        }
        if (current instanceof TextNode) {
            accum.append(((TextNode) current).text());
            continue;
        }
        if (!(current instanceof Element)) {
            continue;
        }
        Element nested = (Element) current;
        // The block-boundary test is evaluated first; the "br" test only runs when it fails.
        boolean needsSeparator = (accum.length() > 0 && nested.isBlock() && !lastCharIsWhitespace(accum))
                || nested.tagName().equals("br");
        if (needsSeparator) {
            accum.append(' ');
        }
        appendTextSkipHidden(nested, accum, indent + 1);
    }
}

Example 46

Project: NiceText-master File: NTHelper.java View source code

/**
 * Strips non-content markup from {@code doc} in three passes.
 * <ol>
 * <li>Unwraps every element matched by one of the {@code UNWRAP_TAGS} selectors.</li>
 * <li>Replaces each "br" inside the body with a text node containing a newline.</li>
 * <li>Removes body elements that are script/noscript/style, links that are long (more than 40
 * characters of text) or whose parent is not a text container or has no own text, elements
 * whose text is shorter than {@code WORDS_T}, and elements whose own text splits into more
 * than three parts on '|'.</li>
 * </ol>
 *
 * @param doc the document to clean; modified in place
 */
private void removeFat(Document doc) {
    for (String selector : UNWRAP_TAGS) {
        doc.select(selector).unwrap();
    }
    for (Element lineBreak : doc.body().getElementsByTag("br")) {
        if (lineBreak != null && lineBreak.tagName().equalsIgnoreCase("br")) {
            lineBreak.replaceWith(new TextNode("\n", null));
        }
    }
    for (Element candidate : doc.body().getAllElements()) {
        String tag = candidate.tagName();
        boolean discard;
        if (tag.equalsIgnoreCase("script") || tag.equalsIgnoreCase("noscript") || tag.equalsIgnoreCase("style")) {
            // never content
            discard = true;
        } else if (tag.equalsIgnoreCase("a")) {
            // long links, and links outside a text-bearing parent, are dropped
            discard = candidate.text().length() > 40
                    || !POSSIBLE_TEXT_NODES.matcher(candidate.parent().tagName()).matches()
                    || candidate.parent().ownText().length() == 0;
        } else {
            // too little text, or own text split into more than three parts on '|'
            discard = candidate.text().length() < WORDS_T
                    || candidate.ownText().split("\\|").length > 3;
        }
        if (discard) {
            candidate.remove();
        }
    }
}

Example 47

Project: shopb2b-master File: Article.java View source code

/**
 * Splits the article content into pages.
 * <p>
 * Empty content yields a single empty page. Content containing {@code contentBreake} is split
 * on it. Otherwise the content is parsed as HTML and the body's top-level nodes are accumulated
 * into a page until the running text length reaches {@code MAX_PAGE_CONTENT_COUNT}: elements
 * are added whole (outer HTML, counted by their text length), text nodes are added piece by
 * piece as split by {@code pattern}, each piece followed by the separator that was matched
 * after it. Whatever remains after the last node forms the final page.
 *
 * @return the pages in order; never {@code null}
 */
@Transient
public String[] getPageContents() {
    if (StringUtils.isEmpty(this.content)) {
        return new String[] { "" };
    }
    if (this.content.contains(contentBreake)) {
        return this.content.split(contentBreake);
    }
    List<String> pages = new ArrayList<String>();
    List<Node> topLevelNodes = Jsoup.parse(this.content).body().childNodes();
    if (topLevelNodes != null) {
        StringBuilder currentPage = new StringBuilder();
        int charsOnPage = 0;
        for (Node node : topLevelNodes) {
            if (node instanceof org.jsoup.nodes.Element) {
                org.jsoup.nodes.Element element = (org.jsoup.nodes.Element) node;
                currentPage.append(element.outerHtml());
                charsOnPage += element.text().length();
                if (!(charsOnPage < MAX_PAGE_CONTENT_COUNT)) {
                    // page is full: emit it and start a new one
                    pages.add(currentPage.toString());
                    charsOnPage = 0;
                    currentPage.setLength(0);
                }
            } else if (node instanceof TextNode) {
                String text = ((TextNode) node).text();
                String[] pieces = pattern.split(text);
                Matcher separators = pattern.matcher(text);
                for (String piece : pieces) {
                    // re-attach the separator that followed this piece, if there is one left
                    String chunk = piece;
                    if (separators.find()) {
                        chunk = piece + separators.group();
                    }
                    currentPage.append(chunk);
                    charsOnPage += chunk.length();
                    if (!(charsOnPage < MAX_PAGE_CONTENT_COUNT)) {
                        pages.add(currentPage.toString());
                        charsOnPage = 0;
                        currentPage.setLength(0);
                    }
                }
            }
        }
        // anything left over becomes the last page
        String remainder = currentPage.toString();
        if (StringUtils.isNotEmpty(remainder)) {
            pages.add(remainder);
        }
    }
    return pages.toArray(new String[pages.size()]);
}

Example 48

Project: SubTools-master File: JAddic7edApi.java View source code

/**
 * Looks up the addic7ed.com episode page for the given show/season/episode/title and
 * extracts one descriptor per completed subtitle download found on it.
 *
 * <p>The page is scraped by CSS class names ({@code titulo}, {@code tabel95},
 * {@code NewsTitle}, {@code newsDate}, {@code language}, {@code buttonDownload}).
 * Rows whose status contains {@code "% completed"} are treated as unfinished and skipped.
 * A descriptor is only emitted when language, download link and episode title are all known,
 * and only if {@code isDuplicate} does not report it as already present.
 *
 * @param showname show name; lower-cased and spaces replaced by underscores for the URL
 * @param season season number used in the URL
 * @param episode episode number used in the URL
 * @param title episode title; lower-cased, spaces replaced by underscores, {@code #} removed
 * @return the subtitles found, possibly empty, never {@code null}
 * @throws Exception whatever {@code getContent} or the scraping itself throws
 */
public List<Addic7edSubtitleDescriptor> searchSubtitles(String showname, int season, int episode, String title) throws Exception {
    // Example: http://www.addic7ed.com/serie/Smallville/9/11/Absolute_Justice
    String url = "http://www.addic7ed.com/serie/" + showname.toLowerCase().replace(" ", "_") + "/" + season + "/" + episode + "/" + title.toLowerCase().replace(" ", "_").replace("#", "");
    String content = this.getContent(false, url);
    List<Addic7edSubtitleDescriptor> lSubtitles = new ArrayList<Addic7edSubtitleDescriptor>();
    Document doc = Jsoup.parse(content);
    String titel = null;
    Elements elTitel = doc.getElementsByClass("titulo");
    if (elTitel.size() == 1) {
        String titelHtml = elTitel.get(0).html();
        int tagStart = titelHtml.indexOf("<");
        // The title is the text before the first child tag, minus the character directly
        // in front of that tag. Without a tag the whole HTML is the title; a tag at
        // position 0 yields an empty title instead of a StringIndexOutOfBoundsException.
        int titelEnd = tagStart < 0 ? titelHtml.length() : Math.max(tagStart - 1, 0);
        titel = titelHtml.substring(0, titelEnd).trim();
    }
    Elements blocks = doc.getElementsByClass("tabel95");
    blocks = blocks.select("table[width=100%]");
    for (Element block : blocks) {
        // Per-block state: every table describes one subtitle version.
        String uploader = "";
        String version = null;
        String lang = null;
        String download = null;
        boolean hearingImpaired = false;
        Elements classesNewsTitle = block.getElementsByClass("NewsTitle");
        Elements classesNewsDate = block.getElementsByClass("newsDate").select("td[colspan=3]");
        Elements imgHearingImpaired = block.select("img").select("img[title~=Hearing]");
        if (classesNewsTitle.size() == 1 && classesNewsDate.size() == 1) {
            TextNode tn = (TextNode) classesNewsTitle.get(0).childNode(1);
            // `pattern` is a field declared outside this method.
            Matcher m = pattern.matcher(tn.text());
            if (!m.find()) {
                // NOTE(review): this aborts the scan of ALL remaining blocks, not just this
                // one - kept as in the original; confirm `continue` was not intended.
                break;
            }
            version = m.group().substring(0, m.group().lastIndexOf(",")).replace("Version", "") + (" ") + classesNewsDate.get(0).text().trim();
            uploader = block.getElementsByTag("a").select("a[href*=user/]").get(0).text();
            hearingImpaired = imgHearingImpaired.size() > 0;
        }
        if (version != null) {
            Elements tds = block.select("tr:contains(Completed)");
            Elements reqTds = tds.select("td").not("td[rowspan=2]");
            for (Element td : reqTds) {
                if (td.hasClass("language")) {
                    String langHtml = td.html();
                    int tagStart = langHtml.indexOf("<");
                    // Language name is the text before the first child tag; if the cell
                    // has no tag at all, the whole cell content is the language.
                    lang = tagStart < 0 ? langHtml : langHtml.substring(0, tagStart);
                }
                if (lang != null && td.toString().toLowerCase().contains("completed")) {
                    // Partially translated subtitles ("xx% completed") are not wanted.
                    if (td.html().toLowerCase().contains("% completed")) {
                        lang = null;
                    }
                }
                if (lang != null && td.getElementsByClass("buttonDownload").size() > 0) {
                    Elements a = td.getElementsByClass("buttonDownload");
                    if (a.size() == 1) {
                        download = "http://www.addic7ed.com" + a.get(0).attr("href");
                    }
                    if (a.size() == 2) {
                        // With two download buttons the second link is used.
                        download = "http://www.addic7ed.com" + a.get(1).attr("href");
                    }
                }
                if (lang != null && download != null && titel != null) {
                    Addic7edSubtitleDescriptor sub = new Addic7edSubtitleDescriptor();
                    sub.setUploader(uploader);
                    sub.setTitel(titel.trim());
                    sub.setVersion(version.trim());
                    sub.setUrl(download);
                    sub.setLanguage(lang.trim());
                    sub.setHearingImpaired(hearingImpaired);
                    if (!isDuplicate(lSubtitles, sub)) {
                        lSubtitles.add(sub);
                    }
                    // Reset so the next language row of this block starts clean.
                    lang = null;
                    download = null;
                }
            }
        }
    }
    return lSubtitles;
}

Example 49

Project: uzlee-master File: ThreadsParser.java View source code

/**
 * Parses a private-message list page into a {@code Threads} collection.
 *
 * <p>Each {@code ul.pm_list li.s_clear} item becomes one entry carrying the sender
 * (id taken from the {@code uid} query parameter of the sender link), the summary text,
 * the "new" flag (presence of {@code img[alt=NEW]}) and the date string with spaces removed.
 * An item that fails to parse is logged via {@code printStackTrace()} and skipped.
 * Paging metadata is derived from {@code div.pages}.
 *
 * @param html raw HTML of the message list page
 * @return the parsed threads including paging metadata
 */
public Threads parseMessages(String html) {
    final Document document = getDoc(html);
    final Threads result = new Threads();
    for (Element item : document.select("ul.pm_list li.s_clear")) {
        try {
            final Elements senderLink = item.select("p.cite a");
            final String senderName = senderLink.text();
            final String senderHref = senderLink.attr("href");
            final String senderId = Utils.getUriQueryParameter(senderHref).get("uid");
            final User sender = new User().setId(Integer.valueOf(senderId)).setName(senderName);
            final String summary = item.select("div.summary").text();
            final boolean unread = !item.select("img[alt=NEW]").isEmpty();
            // The date is the third child node of the citation paragraph.
            final TextNode dateNode = (TextNode) item.select("p.cite").get(0).childNode(2);
            final String date = dateNode.text().replaceAll(" ", "");
            final Thread entry = new Thread().setTitle(summary).setAuthor(sender).setNew(unread).setDateStr(date);
            result.add(entry);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    // The current page number is rendered as <strong>; default to page 1 when absent.
    final Elements currentMarker = document.select("div.pages > strong");
    final int pageNumber = currentMarker.isEmpty() ? 1 : Integer.parseInt(currentMarker.first().text());
    final Elements nextLinks = document.select("div.pages > a[href$=&page=" + (pageNumber + 1) + "]");
    result.getMeta().setHasNextPage(!nextLinks.isEmpty());
    result.getMeta().setPage(pageNumber);
    return result;
}

Example 50

Project: web-entity-extractor-ACL2014-master File: KnowledgeTreeBuilder.java View source code

/**
 * Recursively mirrors a jsoup element (a tag plus its content) as a subtree under
 * {@code parent}.
 *
 * <p>A TAG node is created for the element itself, then one child per child element
 * (recursively) and, unless {@code opts.ignoreTextNodes} is set, one TAG node named
 * {@code "text"} per non-empty text node. Full text longer than
 * {@code opts.maxFullTextLength} is stored as {@code null}. Finally the element's
 * attributes are copied onto the created node.
 *
 * @param elt    the jsoup element that becomes the root of the created subtree
 * @param parent the node under which the created subtree is attached
 */
public void convertElementToKTree(Element elt, KNode parent) {
    final String fullText = LingUtils.normalize(elt.text(), opts.earlyNormalizeEntities);
    final String storedFullText = fullText.length() > opts.maxFullTextLength ? null : fullText;
    final KNode created = parent.createChild(KNode.Type.TAG, elt.tagName(), storedFullText);
    // Children: elements recurse, text nodes become "text" tags.
    for (Node node : elt.childNodes()) {
        if (node instanceof Element) {
            convertElementToKTree((Element) node, created);
            continue;
        }
        if (!(node instanceof TextNode) || opts.ignoreTextNodes) {
            continue;
        }
        final String nodeText = LingUtils.normalize(((TextNode) node).text(), opts.earlyNormalizeEntities);
        if (nodeText.isEmpty()) {
            continue;
        }
        final String storedNodeText = nodeText.length() > opts.maxFullTextLength ? null : nodeText;
        created.createChild(KNode.Type.TAG, "text", storedNodeText);
    }
    // Attributes are copied after all children have been attached.
    for (Attribute attribute : elt.attributes()) {
        created.createAttribute(attribute.getKey(), attribute.getValue());
    }
}

Example 51

Project: awesome-blogs-android-master File: DocumentConverter.java View source code

/**
 * Walks the DOM below {@code el} and estimates how large a buffer must be to hold
 * the converted result.
 *
 * <p>Every text node contributes the length of its text; every child element contributes
 * {@code 4 * depth} plus the estimate of its own subtree, computed at {@code depth + 1}.
 *
 * @param el    element whose subtree is measured
 * @param depth nesting depth of {@code el}'s children
 * @return the estimated length
 */
private static int calculateLength(Element el, int depth) {
    int estimate = 0;
    for (final Node child : el.childNodes()) {
        if (child instanceof TextNode) {
            estimate += ((TextNode) child).text().length();
        } else if (child instanceof Element) {
            estimate += calculateLength((Element) child, depth + 1) + (4 * depth);
        }
    }
    return estimate;
}

Example 52

Project: elasticsearch-river-remote-master File: GetSitemapHtmlClient.java View source code

/**
 * Appends the text of every visited {@link TextNode} to {@code buffer}, followed by a
 * single separating space. Non-text nodes and text nodes that are blank after
 * normalisation are ignored.
 *
 * @param node  the node being entered
 * @param depth depth of the node in the tree (unused)
 */
@Override
public void head(Node node, int depth) {
    if (!(node instanceof TextNode)) {
        return;
    }
    // Turn non-breaking spaces (U+00A0) into plain spaces first, because trim() only
    // strips characters up to U+0020. The escape keeps the character unambiguous in source.
    String text = ((TextNode) node).text().replace('\u00A0', ' ').trim();
    if (!text.isEmpty()) {
        buffer.append(text);
        // After trim() the text can never end with a space, so the separator is always added.
        buffer.append(" ");
    }
}

Example 53

Project: JAVMovieScraper-master File: AvEntertainmentParsingProfile.java View source code

/**
 * Scrapes the running time from the title box of the page.
 *
 * <p>Looks at every {@code div[id=titlebox] ul li} item that has exactly three child nodes,
 * whose third child is a text node and whose second child starts with the label
 * {@code "Playing time"} or its Japanese counterpart. The first run of digits in that
 * text node is the runtime.
 *
 * @return a {@code Runtime} wrapping the digits found, or wrapping {@code ""} if none
 */
@Override
public Runtime scrapeRuntime() {
    String runtime = "";
    // Compiled once instead of once per candidate list item.
    Pattern pattern = Pattern.compile("\\d+");
    Elements elements = document.select("div[id=titlebox] ul li");
    for (Element element : elements) {
        if (element.childNodeSize() != 3) {
            continue;
        }
        Node labelHolder = element.childNode(1);
        Node valueNode = element.childNode(2);
        // A label holder without children would make childNode(0) throw; skip such items.
        if (!(valueNode instanceof TextNode) || labelHolder.childNodeSize() == 0) {
            continue;
        }
        String label = labelHolder.childNode(0).toString();
        // "\u53CE\u9332\u6642\u9593" is the Japanese label (U+53CE U+9332 U+6642 U+9593).
        // It is written as escapes because the former literal was stored as UTF-8 bytes
        // mis-decoded as cp1252 and therefore could not match the page text.
        if (label.startsWith("Playing time") || label.startsWith("\u53CE\u9332\u6642\u9593")) {
            Matcher matcher = pattern.matcher(valueNode.toString());
            if (matcher.find()) {
                runtime = matcher.group();
                break;
            }
        }
    }
    return new Runtime(runtime);
}

Example 54

Project: jHTML2Md-master File: HTML2Md.java View source code

/**
 * Renders the children of {@code element} as text lines and joins them with newlines.
 *
 * <p>Text nodes are appended to the current last line with {@code #} and {@code *}
 * prefixed by {@code /}; a blank text node is dropped when the current line is still
 * empty. Child elements are delegated to {@code processElement}. Each line is trimmed,
 * and of any run of consecutive blank lines only the first one is kept.
 *
 * @param element element whose content is rendered
 * @return the rendered text
 */
private static String getTextContent(Element element) {
    ArrayList<MDLine> lines = new ArrayList<MDLine>();
    for (Node child : element.childNodes()) {
        if (child instanceof Element) {
            processElement((Element) child, lines);
        } else if (child instanceof TextNode) {
            TextNode textNode = (TextNode) child;
            MDLine current = getLastLine(lines);
            // Leading whitespace-only text must not open an otherwise empty line.
            if (!current.getContent().equals("") || !textNode.isBlank()) {
                current.append(textNode.text().replaceAll("#", "/#").replaceAll("\\*", "/\\*"));
            }
        } else {
            System.out.println();
        }
    }
    StringBuilder out = new StringBuilder();
    int consecutiveBlank = 0;
    final int lastIndex = lines.size() - 1;
    for (int idx = 0; idx <= lastIndex; idx++) {
        String trimmed = lines.get(idx).toString().trim();
        consecutiveBlank = trimmed.equals("") ? consecutiveBlank + 1 : 0;
        if (consecutiveBlank >= 2) {
            // Second and later blank lines of a run are dropped entirely.
            continue;
        }
        out.append(trimmed);
        if (idx < lastIndex) {
            out.append("\n");
        }
    }
    return out.toString();
}

Example 55

Project: jodtemplate-master File: HtmlStylizer.java View source code

/**
 * Converts a jsoup element and, recursively, its children into output elements.
 *
 * <p>A {@code BR_TAG} element maps to a single line-break element. Otherwise child
 * elements are processed recursively and text nodes are turned into text elements via
 * {@code createTextElement}. For a {@code LI_TAG} the collected elements are wrapped by
 * {@code createListElements}, for a {@code P_TAG} by {@code createParagraphElement};
 * any other tag returns the collected elements unchanged.
 *
 * @param element the jsoup element to convert
 * @param arPr    run properties passed through to created text elements
 * @param apPr    paragraph properties passed through to list/paragraph creation
 * @param slide   the slide passed through to created text elements
 * @return the elements produced for {@code element}
 * @throws IOException declared for the nested conversions
 */
private List<Element> process(final org.jsoup.nodes.Element element, final Element arPr, final Element apPr, final Slide slide) throws IOException {
    final String tagName = element.tagName();
    if (BR_TAG.equals(tagName)) {
        return Arrays.asList(new Element(PPTXDocument.BR_ELEMENT, getDrawingmlNamespace()));
    }
    final List<org.jsoup.nodes.Element> tags = getAllTags(element);
    final List<Element> produced = new ArrayList<>();
    for (Node child : element.childNodes()) {
        if (child instanceof TextNode) {
            produced.add(createTextElement(tags, arPr, (TextNode) child, slide));
        } else if (child instanceof org.jsoup.nodes.Element) {
            produced.addAll(process((org.jsoup.nodes.Element) child, arPr, apPr, slide));
        }
    }
    if (LI_TAG.equals(tagName)) {
        return createListElements(tags, produced, apPr, element);
    }
    return P_TAG.equals(tagName) ? Arrays.asList(createParagraphElement(produced, apPr)) : produced;
}

Example 56

Project: Mover-master File: MoverParser.java View source code

/**
 * Fills {@code channel} with the details shown in the {@code div#channel-box} of a
 * channel page: picture URL, display name, video count, registration date and profile
 * view count.
 *
 * <p>The registration date and view count are read positionally from the direct text
 * nodes of {@code div.data} (index 1 and 2). This depends on the exact page layout;
 * a missing element or text node results in an exception.
 *
 * @param element root element of the channel page
 * @param channel channel object to populate
 * @return the same {@code channel} instance, populated
 */
public Channel getChannelExpandedInfo(Element element, Channel channel) {
    Elements channelBox = element.select("div#channel-box");
    // User picture source link
    String userPicture = channelBox.select("a.userpic img").first().attr("src");
    channel.setPicture(userPicture);
    // Channel display name
    String displayName = channelBox.select("div.info div.user").first().text();
    channel.setDisplayName(displayName);
    String videosCount = channelBox.select("div.info div.videos").first().text();
    channel.setVideosCount(internalGetIntegers(videosCount));
    List<TextNode> dataNodes = channelBox.select("div.data").first().textNodes();
    // Index 1 is the text node holding the registration date, prefixed by the Russian
    // label for "Registration:". The label is written with Unicode escapes because the
    // former literal was UTF-8 mis-decoded as cp1252 and never matched, leaving the
    // label inside the string handed to the date parser.
    String registrationDate = dataNodes.get(1).text().replace("\u0420\u0435\u0433\u0438\u0441\u0442\u0440\u0430\u0446\u0438\u044F:", "").trim();
    channel.setRegistrationDate(parseRussianFormat(PROFILE_FORMAT, registrationDate));
    // Index 2 is the text node holding the profile view count.
    String profileViewsCount = dataNodes.get(2).text();
    channel.setProfileViewsCount(internalGetIntegers(profileViewsCount));
    return channel;
}

Example 57

Project: scheduler-legacy-master File: CourseParser.java View source code

/**
 * Parses the Catalog Entry page of a course and stores the long description, the credit
 * hour breakdown and the department in {@code values}.
 *
 * <p>All data is read from the direct text nodes of the first
 * {@code table.datadisplaytable td.ntdefault} cell: the first text node is the
 * description (key {@code "description"}), text nodes of the form
 * {@code "<number> <component> ..."} become {@code "credit.<component>"} entries, and the
 * third text node from the end is the department (key {@code "department"}).
 *
 * @param document the Catalog Entry page HTML document
 * @param values the course data map that receives the Catalog Entry values
 */
private void parseCatalogEntry(Document document, Map<String, String> values) {
    Element longDetailElement = document.select("table.datadisplaytable td.ntdefault").first();
    // Fetched once; the cell is not modified while it is being read.
    List<TextNode> textNodes = longDetailElement.textNodes();
    // Long description is the first text node in the cell
    values.put("description", textNodes.get(0).toString());
    // Credit hours are in text nodes following the long description
    for (TextNode creditNode : textNodes) {
        String text = creditNode.text();
        try (Scanner scanner = new Scanner(text)) {
            scanner.useDelimiter(" ");
            if (text.contains("TO")) {
                logger.debug("Found credit range entry, will attempt to use max value in range. Range: {}", text);
                // Some entries use the "X.000 TO Y.000 Credits" format. X is almost always 0,
                // so Y is taken as the credit count: skip "X.000" and "TO". The hasNext()
                // guard keeps short text that merely contains "TO" from throwing.
                for (int skipped = 0; skipped < 2 && scanner.hasNext(); skipped++) {
                    scanner.next();
                }
            }
            // A number without a following component name is not a credit entry; it is
            // logged like any other unexpected node instead of raising NoSuchElementException.
            if (scanner.hasNextDouble()) {
                double value = scanner.nextDouble();
                if (scanner.hasNext()) {
                    String component = scanner.next();
                    values.put("credit." + component, Double.toString(value));
                    logger.debug("Found credit hour entry: {}={}", component, value);
                } else {
                    logger.debug("Expected credit hour text node, found instead: {}", text);
                }
            } else {
                logger.debug("Expected credit hour text node, found instead: {}", text);
            }
        }
    }
    // Department always seems to be the 3rd text node from the end of the cell
    String department = textNodes.get(textNodes.size() - 3).toString();
    values.put("department", department);
}

Example 58

Project: symphony-master File: Markdowns.java View source code

/**
 * Post-processes a text node that is not inside a {@code code} element.
 *
 * <p>When {@code userQueryService} is available, {@code @userName } mentions found by it
 * are rewritten into member links and {@code @participants } into a fixed link. If the
 * resulting text contains such a link, it is parsed as an HTML fragment, inserted at the
 * text node's position and the original node is queued in {@code toRemove}; otherwise the
 * node's text is replaced by the result of {@code Pangu.spacingText}.
 *
 * @param node  the node being entered
 * @param depth depth of the node in the tree (unused)
 */
@Override
public void head(final org.jsoup.nodes.Node node, int depth) {
    if (!(node instanceof org.jsoup.nodes.TextNode)) {
        return;
    }
    final org.jsoup.nodes.TextNode textNode = (org.jsoup.nodes.TextNode) node;
    final org.jsoup.nodes.Node parent = textNode.parent();
    if (!(parent instanceof org.jsoup.nodes.Element)) {
        return;
    }
    final Element parentElem = (Element) parent;
    if (parentElem.tagName().equals("code")) {
        // Code content is left untouched.
        return;
    }
    String text = textNode.getWholeText();
    if (null != userQueryService) {
        try {
            final Set<String> mentioned = userQueryService.getUserNames(text);
            for (final String name : mentioned) {
                text = text.replace('@' + name + " ", "@<a href='" + Latkes.getServePath() + "/member/" + name + "'>" + name + "</a> ");
            }
            text = text.replace("@participants ", "@<a href='https://hacpai.com/article/1458053458339' class='ft-red'>participants</a> ");
        } finally {
            JdbcRepository.dispose();
        }
    }
    if (!text.contains("@<a href=")) {
        textNode.text(Pangu.spacingText(text));
        return;
    }
    // The text now carries markup: swap the text node for the parsed fragment.
    final List<org.jsoup.nodes.Node> fragment = Parser.parseFragment(text, parentElem, "");
    parentElem.insertChildren(textNode.siblingIndex(), fragment);
    toRemove.add(node);
}

Example 59

Project: holoreader-master File: RefreshFeedService.java View source code

/**
 * Builds the database row for one feed article.
 *
 * <p>If only one of {@code summary} and {@code content} is present, it is used for both.
 * In the content every {@code iframe} is replaced by the text {@code "(video removed)"}.
 * The summary is reduced to plain text without images and cut to
 * {@code SUMMARY_MAXLENGTH} characters followed by {@code "..."}. The absolute URL of the
 * first image in the content, if any, is stored as the article image.
 *
 * @param feedID  id of the feed the article belongs to
 * @param guid    article GUID
 * @param link    article link
 * @param pubdate publication date
 * @param title   article title
 * @param summary article summary HTML, may be {@code null}
 * @param content article content HTML, may be {@code null}
 * @return the populated values, or {@code null} if both summary and content are missing
 */
private ContentValues prepareArticle(int feedID, String guid, String link, Date pubdate, String title, String summary, String content) {
    if (content == null && summary == null) {
        return null;
    }
    if (content == null) {
        content = summary;
    } else if (summary == null) {
        summary = content;
    }
    Document parsedContent = Jsoup.parse(content);
    Elements iframes = parsedContent.getElementsByTag("iframe");
    for (Element mIframe : iframes) {
        // A node can only have one parent: re-using a single placeholder instance would
        // move it from iframe to iframe and leave it only where the last iframe was.
        // Hence one fresh placeholder per iframe.
        mIframe.replaceWith(new TextNode("(video removed)", null));
    }
    content = parsedContent.html();
    Document parsedSummary = Jsoup.parse(summary);
    Elements pics = parsedSummary.getElementsByTag("img");
    for (Element pic : pics) {
        pic.remove();
    }
    summary = parsedSummary.text();
    if (summary.length() > SUMMARY_MAXLENGTH) {
        summary = summary.substring(0, SUMMARY_MAXLENGTH) + "...";
    }
    Element image = parsedContent.select("img").first();
    ContentValues contentValues = new ContentValues();
    contentValues.put(ArticleDAO.FEEDID, feedID);
    contentValues.put(ArticleDAO.GUID, guid);
    contentValues.put(ArticleDAO.LINK, link);
    contentValues.put(ArticleDAO.PUBDATE, SQLiteHelper.fromDate(pubdate));
    contentValues.put(ArticleDAO.TITLE, title);
    contentValues.put(ArticleDAO.SUMMARY, summary);
    contentValues.put(ArticleDAO.CONTENT, content);
    if (image != null) {
        contentValues.put(ArticleDAO.IMAGE, image.absUrl("src"));
    }
    contentValues.put(ArticleDAO.ISDELETED, 0);
    return contentValues;
}

Example 60

Project: Ouroboros-master File: CommentParser.java View source code

/**
 * Converts the direct children of one comment body line into a single styled
 * {@code CharSequence}.
 *
 * <p>Text nodes are formatted as normal text. Child elements are formatted by tag name:
 * {@code span}, {@code em}, {@code strong}, {@code u}, {@code s} and {@code a} use their
 * dedicated parsers; any other tag is treated as normal text. Other node types are ignored.
 *
 * @param bodyLine        element whose children are formatted
 * @param currentBoard    passed through to anchor parsing
 * @param resto           passed through to anchor parsing
 * @param fragmentManager passed through to anchor parsing
 * @param infiniteDbHelper passed through to anchor parsing
 * @return the concatenated formatted text
 */
private CharSequence parseFormatting(Element bodyLine, String currentBoard, String resto, FragmentManager fragmentManager, InfiniteDbHelper infiniteDbHelper) {
    CharSequence result = "";
    for (Node node : bodyLine.childNodes()) {
        if (node instanceof TextNode) {
            String plain = ((TextNode) node).text();
            result = TextUtils.concat(result, parseNormalText(new SpannableString(plain)));
            continue;
        }
        if (!(node instanceof Element)) {
            continue;
        }
        Element child = (Element) node;
        CharSequence piece;
        switch(child.tagName()) {
            case "span":
                piece = parseSpanText(child);
                break;
            case "em":
                piece = parseItalicText(new SpannableString(child.text()));
                break;
            case "strong":
                piece = parseBoldText(new SpannableString(child.text()));
                break;
            case "u":
                piece = parseUnderlineText(new SpannableString(child.text()));
                break;
            case "s":
                piece = parseStrikethroughText(new SpannableString(child.text()));
                break;
            case "a":
                piece = parseAnchorText(child, currentBoard, resto, fragmentManager, infiniteDbHelper);
                break;
            default:
                piece = parseNormalText(new SpannableString(child.text()));
                break;
        }
        result = TextUtils.concat(result, piece);
    }
    return result;
}

Example 61

Project: TuCanMobile-master File: EventsScraper.java View source code

/**
 * Returns the individual events of the course status table as a list adapter.
 *
 * <p>Only rows with exactly four cells are considered. A row whose first cell has class
 * {@code tbsubhead} is a module header and is added only when its second cell has four
 * child nodes, the last being a text node. A row whose first cell has class
 * {@code tbdata} is an event and is always added; its name, instructor and dates depend
 * on the number of child nodes in the second cell and stay {@code null} when the layout
 * is not recognised.
 *
 * @param content the content div element
 * @return the adapter, or {@code null} if there is no {@code table.tbcoursestatus} or
 *         it has no rows
 * @author Daniel Thiem
 */
private ListAdapter getApplicationSingleItems(Element content) {
    final Element statusTable = content.select("table.tbcoursestatus").first();
    if (statusTable == null) {
        return null;
    }
    final Elements rows = statusTable.select("tr");
    if (rows.isEmpty()) {
        return null;
    }
    // Individual events are offered
    ArrayList<String> names = new ArrayList<String>();
    ArrayList<String> instructors = new ArrayList<String>();
    ArrayList<String> dates = new ArrayList<String>();
    ArrayList<Boolean> moduleFlags = new ArrayList<Boolean>();
    for (Element row : rows) {
        final Elements cells = row.select("td");
        final Element head = cells.first();
        if (head == null || cells.size() != 4) {
            continue;
        }
        final Element detail = cells.get(1);
        final List<Node> parts = detail.childNodes();
        if (head.hasClass("tbsubhead")) {
            // Module header row
            if (parts.size() != 4) {
                continue;
            }
            final Node instructorNode = parts.get(3);
            if (instructorNode instanceof TextNode) {
                String moduleInstructor = ((TextNode) instructorNode).text();
                String moduleName = detail.select("span.eventTitle").text();
                String moduleDeadline = cells.get(2).text();
                names.add(moduleName);
                instructors.add(moduleInstructor);
                dates.add(moduleDeadline);
                moduleFlags.add(true);
            }
            continue;
        }
        if (!head.hasClass("tbdata")) {
            continue;
        }
        // Event row
        String eventName = null;
        String eventInstructor = null;
        String eventDates = null;
        switch (parts.size()) {
            case 1: {
                // Event with a name only
                eventName = TucanMobile.getEventNameByString(detail.html());
                eventInstructor = "";
                eventDates = "";
                break;
            }
            case 7: {
                // Event with full information
                final Node instructorNode = parts.get(4);
                final Node dateNode = parts.get(6);
                if (instructorNode instanceof TextNode && dateNode instanceof TextNode) {
                    eventName = detail.select("span.eventTitle").text();
                    eventInstructor = ((TextNode) instructorNode).text().trim();
                    eventDates = ((TextNode) dateNode).text().trim();
                }
                break;
            }
            case 5: {
                // Event without a date
                final Node instructorNode = parts.get(4);
                if (instructorNode instanceof TextNode) {
                    eventName = detail.select("span.eventTitle").text();
                    eventInstructor = ((TextNode) instructorNode).text().trim();
                    eventDates = "";
                }
                break;
            }
            default:
                break;
        }
        names.add(eventName);
        instructors.add(eventInstructor);
        dates.add(eventDates);
        moduleFlags.add(false);
    }
    // Build the adapter to return
    return new HighlightedThreeLinesAdapter(context, names, instructors, dates, moduleFlags);
}

Example 62

Project: WebCollector-master File: ContentExtractor.java View source code

/**
 * Recursively computes text/tag statistics for {@code node}.
 *
 * <p>A text node yields its text length as {@code textCount} and as a single leaf.
 * An element aggregates the statistics of its children, counts itself as one tag
 * ({@code a} elements also as a link tag whose whole text counts as link text,
 * {@code p} elements as a paragraph), derives its density as non-link text length divided
 * by non-link tag count (0 if either is 0) and is recorded in {@code infoMap}.
 * Any other node type yields an empty {@code CountInfo}.
 *
 * @param node the node to analyse
 * @return the statistics for {@code node}
 */
protected CountInfo computeInfo(Node node) {
    final CountInfo info = new CountInfo();
    if (node instanceof TextNode) {
        final int length = ((TextNode) node).text().length();
        info.textCount = length;
        info.leafList.add(length);
        return info;
    }
    if (!(node instanceof Element)) {
        return info;
    }
    final Element tag = (Element) node;
    for (Node child : tag.childNodes()) {
        final CountInfo sub = computeInfo(child);
        info.textCount += sub.textCount;
        info.linkTextCount += sub.linkTextCount;
        info.tagCount += sub.tagCount;
        info.linkTagCount += sub.linkTagCount;
        info.leafList.addAll(sub.leafList);
        info.densitySum += sub.density;
        info.pCount += sub.pCount;
    }
    info.tagCount++;
    final String name = tag.tagName();
    if (name.equals("a")) {
        // Everything inside a link counts as link text.
        info.linkTextCount = info.textCount;
        info.linkTagCount++;
    } else if (name.equals("p")) {
        info.pCount++;
    }
    final int plainTextLength = info.textCount - info.linkTextCount;
    final int plainTagCount = info.tagCount - info.linkTagCount;
    if (plainTextLength != 0 && plainTagCount != 0) {
        info.density = (plainTextLength + 0.0) / plainTagCount;
    } else {
        info.density = 0;
    }
    infoMap.put(tag, info);
    return info;
}

Example 63

Project: yobi-master File: AutoLinkRenderer.java View source code

/**
 * Rewrites {@code body} so that text matching {@code pattern} is replaced by links.
 *
 * <p>The body is parsed with pretty-printing disabled. For every element whose own text
 * matches the pattern and that {@code isIgnoreElement} does not exclude, each direct text
 * node is emptied and the result of {@code convertLink} for its former text is inserted
 * after it as HTML. The serialised body then replaces {@code this.body}.
 *
 * @param pattern pattern identifying the text to link
 * @param toLink  strategy passed to {@code convertLink}
 * @return this renderer, for chaining
 */
private AutoLinkRenderer parse(Pattern pattern, ToLink toLink) {
    final Document document = Jsoup.parse(body);
    document.outputSettings().prettyPrint(false);
    for (Element candidate : document.getElementsMatchingOwnText(pattern)) {
        if (!isIgnoreElement(candidate)) {
            for (TextNode textNode : candidate.textNodes()) {
                final String linked = convertLink(textNode.text(), pattern, toLink);
                textNode.text(StringUtils.EMPTY);
                textNode.after(linked);
            }
        }
    }
    this.body = document.body().html();
    return this;
}

Example 64

Project: asta4d-master File: Asta4DTagSupportHtmlTreeBuilder.java View source code

/**
 * Appends the characters of {@code characterToken} to the current element.
 *
 * <p>Inside {@code script} and {@code style} elements the characters are stored as a
 * {@link DataNode}; everywhere else as a {@link TextNode}.
 *
 * @param characterToken the character token to insert
 */
void insert(Token.Character characterToken) {
    final String currentTag = currentElement().tagName();
    final boolean rawData = currentTag.equals("script") || currentTag.equals("style");
    final Node created = rawData
            ? new DataNode(characterToken.getData(), baseUri)
            : new TextNode(characterToken.getData(), baseUri);
    // insertNode is bypassed on purpose: these nodes are never fostered, and a stack always exists here.
    currentElement().appendChild(created);
}

Example 65

Project: LTB-android-master File: LTCScraper.java View source code

/**
 * Fetches and parses the arrival predictions for one route at one stop.
 *
 * <p>Every direct text node of every {@code div} in the fetched document is checked, in
 * this order, against {@code NO_BUS_PATTERN}, {@code NO_INFO_PATTERN} and
 * {@code ARRIVAL_PATTERN}; each arrival match becomes one prediction. On any failure
 * (no divs, "no bus"/"no info" text, no arrivals, timeout, I/O error) the status is set
 * to failed and a single prediction carrying the error message is appended instead.
 *
 * @param route        the route to query
 * @param stopNumber   the stop to query
 * @param scrapeStatus receives the outcome of the scrape
 * @return the predictions found, or the error placeholder entry; never {@code null}
 */
public ArrayList<Prediction> getPredictions(LTCRoute route, String stopNumber, ScrapeStatus scrapeStatus) {
    // Three results is the usual case.
    ArrayList<Prediction> found = new ArrayList<Prediction>(3);
    Resources resources = context.getResources();
    try {
        // Current time, truncated to the minute.
        Calendar now = Calendar.getInstance();
        now.set(Calendar.SECOND, 0);
        now.set(Calendar.MILLISECOND, 0);
        Document page = parseDocFromUri(proxyPredictionPath(route, stopNumber), ltcPredictionPath(route, stopNumber), INITIAL_FETCH_TIMEOUT);
        Elements sections = page.select("div");
        if (sections.size() == 0) {
            throw new ScrapeException("LTC down?", ScrapeStatus.PROBLEM_IMMEDIATELY, true);
        }
        for (Element section : sections) {
            for (TextNode textNode : section.textNodes()) {
                String line = textNode.text();
                if (NO_BUS_PATTERN.matcher(line).find()) {
                    throw new ScrapeException(resources.getString(R.string.no_further), ScrapeStatus.PROBLEM_IF_ALL, false);
                }
                if (NO_INFO_PATTERN.matcher(line).find()) {
                    throw new ScrapeException(resources.getString(R.string.no_service), ScrapeStatus.PROBLEM_IF_ALL, false);
                }
                Matcher arrival = ARRIVAL_PATTERN.matcher(line);
                while (arrival.find()) {
                    // Group 1 is the arrival time text, group 2 the destination.
                    found.add(new Prediction(route, arrival.group(1), arrival.group(2), now));
                }
            }
        }
        if (found.size() == 0) {
            throw new ScrapeException(resources.getString(R.string.no_bus), ScrapeStatus.PROBLEM_IF_ALL, true);
        }
        scrapeStatus.setStatus(ScrapeStatus.OK, ScrapeStatus.NOT_PROBLEM, null);
    } catch (ScrapeException e) {
        scrapeStatus.setStatus(ScrapeStatus.FAILED, e.problemType, e.getMessage());
        found.add(new Prediction(route, e.getMessage(), e.seriousProblem));
    } catch (SocketTimeoutException e) {
        scrapeStatus.setStatus(ScrapeStatus.FAILED, ScrapeStatus.PROBLEM_IMMEDIATELY, e.getMessage());
        found.add(new Prediction(context, route, R.string.times_timeout, true));
    } catch (IOException e) {
        scrapeStatus.setStatus(ScrapeStatus.FAILED, ScrapeStatus.PROBLEM_IMMEDIATELY, e.getMessage());
        found.add(new Prediction(context, route, R.string.times_fail, true));
    }
    return found;
}

Example 66

Project: opacclient-master File: SISIS.java View source code

public SearchRequestResult parse_search(String html, int page) throws OpacErrorException, SingleResultFound {
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url + "/searchfoo");
    if (doc.select(".error").size() > 0) {
        throw new OpacErrorException(doc.select(".error").text().trim());
    } else if (doc.select(".nohits").size() > 0) {
        throw new OpacErrorException(doc.select(".nohits").text().trim());
    } else if (doc.select(".box-header h2, #nohits").text().contains("keine Treffer")) {
        return new SearchRequestResult(new ArrayList<SearchResult>(), 0, 1, 1);
    }
    int results_total = -1;
    String resultnumstr = doc.select(".box-header h2").first().text();
    if (resultnumstr.contains("(1/1)") || resultnumstr.contains(" 1/1")) {
        throw new SingleResultFound();
    } else if (resultnumstr.contains("(")) {
        results_total = Integer.parseInt(resultnumstr.replaceAll(".*\\(([0-9]+)\\).*", "$1"));
    } else if (resultnumstr.contains(": ")) {
        results_total = Integer.parseInt(resultnumstr.replaceAll(".*: ([0-9]+)$", "$1"));
    }
    Elements table = doc.select("table.data tbody tr");
    identifier = null;
    Elements links = doc.select("table.data a");
    boolean haslink = false;
    for (int i = 0; i < links.size(); i++) {
        Element node = links.get(i);
        if (node.hasAttr("href") & node.attr("href").contains("singleHit.do") && !haslink) {
            haslink = true;
            try {
                List<NameValuePair> anyurl = URLEncodedUtils.parse(new URI(node.attr("href").replace(" ", "%20").replace("&", "&")), ENCODING);
                for (NameValuePair nv : anyurl) {
                    if (nv.getName().equals("identifier")) {
                        identifier = nv.getValue();
                        break;
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    List<SearchResult> results = new ArrayList<>();
    for (int i = 0; i < table.size(); i++) {
        Element tr = table.get(i);
        SearchResult sr = new SearchResult();
        if (tr.select("td img[title]").size() > 0) {
            String title = tr.select("td img").get(0).attr("title");
            String[] fparts = tr.select("td img").get(0).attr("src").split("/");
            String fname = fparts[fparts.length - 1];
            MediaType default_by_fname = defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "").replace(".gif", "").replace(".png", ""));
            MediaType default_by_title = defaulttypes.get(title);
            MediaType default_name = default_by_title != null ? default_by_title : default_by_fname;
            if (data.has("mediatypes")) {
                try {
                    sr.setType(MediaType.valueOf(data.getJSONObject("mediatypes").getString(fname)));
                } catch (JSONExceptionIllegalArgumentException |  e) {
                    sr.setType(default_name);
                }
            } else {
                sr.setType(default_name);
            }
        }
        String alltext = tr.text();
        if (alltext.contains("eAudio") || alltext.contains("eMusic")) {
            sr.setType(MediaType.MP3);
        } else if (alltext.contains("eVideo")) {
            sr.setType(MediaType.EVIDEO);
        } else if (alltext.contains("eBook")) {
            sr.setType(MediaType.EBOOK);
        } else if (alltext.contains("Munzinger")) {
            sr.setType(MediaType.EDOC);
        }
        if (tr.children().size() > 3 && tr.child(3).select("img[title*=cover]").size() == 1) {
            sr.setCover(tr.child(3).select("img[title*=cover]").attr("abs:src"));
            if (sr.getCover().contains("showCover.do")) {
                downloadCover(sr);
            }
        }
        Element middlething;
        if (tr.children().size() > 2 && tr.child(2).select("a").size() > 0) {
            middlething = tr.child(2);
        } else {
            middlething = tr.child(1);
        }
        List<Node> children = middlething.childNodes();
        if (middlething.select("div").not("#hlrightblock,.bestellfunktionen").size() == 1) {
            Element indiv = middlething.select("div").not("#hlrightblock,.bestellfunktionen").first();
            if (indiv.children().size() > 1) {
                children = indiv.childNodes();
            }
        } else if (middlething.select("span.titleData").size() == 1) {
            children = middlething.select("span.titleData").first().childNodes();
        }
        int childrennum = children.size();
        List<String[]> strings = new ArrayList<>();
        for (int ch = 0; ch < childrennum; ch++) {
            Node node = children.get(ch);
            if (node instanceof TextNode) {
                String text = ((TextNode) node).text().trim();
                if (text.length() > 3) {
                    strings.add(new String[] { "text", "", text });
                }
            } else if (node instanceof Element) {
                List<Node> subchildren = node.childNodes();
                for (int j = 0; j < subchildren.size(); j++) {
                    Node subnode = subchildren.get(j);
                    if (subnode instanceof TextNode) {
                        String text = ((TextNode) subnode).text().trim();
                        if (text.length() > 3) {
                            strings.add(new String[] { ((Element) node).tag().getName(), "text", text, ((Element) node).className(), node.attr("style") });
                        }
                    } else if (subnode instanceof Element) {
                        String text = ((Element) subnode).text().trim();
                        if (text.length() > 3) {
                            strings.add(new String[] { ((Element) node).tag().getName(), ((Element) subnode).tag().getName(), text, ((Element) node).className(), node.attr("style") });
                        }
                    }
                }
            }
        }
        StringBuilder description = null;
        if (tr.select("span.Z3988").size() == 1) {
            // Sometimes there is a <span class="Z3988"> item which provides
            // data in a standardized format.
            List<NameValuePair> z3988data;
            boolean hastitle = false;
            try {
                description = new StringBuilder();
                z3988data = URLEncodedUtils.parse(new URI("http://dummy/?" + tr.select("span.Z3988").attr("title")), "UTF-8");
                for (NameValuePair nv : z3988data) {
                    if (nv.getValue() != null) {
                        if (!nv.getValue().trim().equals("")) {
                            if (nv.getName().equals("rft.btitle") && !hastitle) {
                                description.append("<b>").append(nv.getValue()).append("</b>");
                                hastitle = true;
                            } else if (nv.getName().equals("rft.atitle") && !hastitle) {
                                description.append("<b>").append(nv.getValue()).append("</b>");
                                hastitle = true;
                            } else if (nv.getName().equals("rft.au")) {
                                description.append("<br />").append(nv.getValue());
                            } else if (nv.getName().equals("rft.date")) {
                                description.append("<br />").append(nv.getValue());
                            }
                        }
                    }
                }
            } catch (URISyntaxException e) {
                description = null;
            }
        }
        boolean described = false;
        if (description != null && description.length() > 0) {
            sr.setInnerhtml(description.toString());
            described = true;
        } else {
            description = new StringBuilder();
        }
        int k = 0;
        boolean yearfound = false;
        boolean titlefound = false;
        boolean sigfound = false;
        for (String[] part : strings) {
            if (!described) {
                if (part[0].equals("a") && (k == 0 || !titlefound)) {
                    if (k != 0) {
                        description.append("<br />");
                    }
                    description.append("<b>").append(part[2]).append("</b>");
                    titlefound = true;
                } else if (part[2].matches("\\D*[0-9]{4}\\D*") && part[2].length() <= 10) {
                    yearfound = true;
                    if (k != 0) {
                        description.append("<br />");
                    }
                    description.append(part[2]);
                } else if (k == 1 && !yearfound && part[2].matches("^\\s*\\([0-9]{4}\\)$")) {
                    if (k != 0) {
                        description.append("<br />");
                    }
                    description.append(part[2]);
                } else if (k == 1 && !yearfound && part[2].matches("^\\s*\\([0-9]{4}\\)$")) {
                    if (k != 0) {
                        description.append("<br />");
                    }
                    description.append(part[2]);
                } else if (k == 1 && !yearfound) {
                    description.append("<br />");
                    description.append(part[2]);
                } else if (k > 1 && k < 4 && !sigfound && part[0].equals("text") && part[2].matches("^[A-Za-z0-9,\\- ]+$")) {
                    description.append("<br />");
                    description.append(part[2]);
                }
            }
            if (part.length == 4) {
                if (part[0].equals("span") && part[3].equals("textgruen")) {
                    sr.setStatus(SearchResult.Status.GREEN);
                } else if (part[0].equals("span") && part[3].equals("textrot")) {
                    sr.setStatus(SearchResult.Status.RED);
                }
            } else if (part.length == 5) {
                if (part[4].contains("purple")) {
                    sr.setStatus(SearchResult.Status.YELLOW);
                }
            }
            if (sr.getStatus() == null) {
                if ((part[2].contains("entliehen") && part[2].startsWith("Vormerkung ist leider nicht mÃ¶glich")) || part[2].contains("nur in anderer Zweigstelle ausleihbar und nicht bestellbar")) {
                    sr.setStatus(SearchResult.Status.RED);
                } else if (part[2].startsWith("entliehen") || part[2].contains("Ein Exemplar finden Sie in einer anderen Zweigstelle")) {
                    sr.setStatus(SearchResult.Status.YELLOW);
                } else if ((part[2].startsWith("bestellbar") && !part[2].contains("nicht bestellbar")) || (part[2].startsWith("vorbestellbar") && !part[2].contains("nicht vorbestellbar")) || (part[2].startsWith("vorbestellbar") && !part[2].contains("nicht vorbestellbar")) || (part[2].startsWith("vormerkbar") && !part[2].contains("nicht vormerkbar")) || (part[2].contains("heute zurÃ¼ckgebucht")) || (part[2].contains("ausleihbar") && !part[2].contains("nicht ausleihbar"))) {
                    sr.setStatus(SearchResult.Status.GREEN);
                }
                if (sr.getType() != null) {
                    if (sr.getType().equals(MediaType.EBOOK) || sr.getType().equals(MediaType.EVIDEO) || sr.getType().equals(MediaType.MP3)) // Especially Onleihe.de ebooks are often marked
                    // green though they are not available.
                    {
                        sr.setStatus(SearchResult.Status.UNKNOWN);
                    }
                }
            }
            k++;
        }
        if (!described) {
            sr.setInnerhtml(description.toString());
        }
        sr.setNr(10 * (page - 1) + i);
        sr.setId(null);
        results.add(sr);
    }
    resultcount = results.size();
    return new SearchRequestResult(results, results_total, page);
}

Example 67

Project: sitebricks-master File: HtmlTemplateCompiler.java View source code

/**
 * Recursively traverses the children of the given node and turns each of them into a
 * widget, collecting the widgets in a chain that is returned to the caller.
 */
@NotNull
private <N extends Node> WidgetChain walk(PageCompilingContext pc, N node) {
    WidgetChain chain = Chains.proceeding();
    for (Node current : node.childNodes()) {
        if (current instanceof Element) {
            final Element element = (Element) current;
            // remember the element when stepping into a form tag
            if (element.tagName().equals("form")) {
                pc.form = element;
            }
            // lexicalClimb reports whether a scope has to be popped again afterwards
            final boolean scopePushed = lexicalClimb(pc, element);
            try {
                // children first (post-order, depth-first), then the element itself
                WidgetChain descendants = walk(pc, element);
                chain.addWidget(widgetize(pc, element, descendants));
            } finally {
                lexicalDescend(pc, element, scopePushed);
            }
            continue;
        }
        if (current instanceof TextNode) {
            final TextNode textNode = (TextNode) current;
            // lexicalClimb reports whether a scope has to be popped again afterwards
            final boolean scopePushed = lexicalClimb(pc, textNode);
            try {
                Renderable plainText = registry.textWidget(cleanHtml(current), pc.lexicalScopes.peek());
                if (textNode.hasAttr(ANNOTATION_KEY)) {
                    // annotated text: the text widget becomes the only child of a
                    // widget created from the annotation
                    WidgetChain annotatedChildren = Chains.proceeding().addWidget(plainText);
                    String widgetName = textNode.attr(ANNOTATION_KEY).toLowerCase();
                    Renderable annotationWidget = registry.newWidget(widgetName, textNode.attr(ANNOTATION_CONTENT), annotatedChildren, pc.lexicalScopes.peek());
                    chain.addWidget(annotationWidget);
                } else {
                    // plain text goes straight into the chain
                    chain.addWidget(plainText);
                }
            } catch (ExpressionCompileException e) {
                pc.errors.add(CompileError.in(node.outerHtml()).near(line(current)).causedBy(e));
            }
            if (scopePushed) {
                pc.lexicalScopes.pop();
            }
            continue;
        }
        if ((current instanceof Comment) || (current instanceof DataNode)) {
            // comments and data nodes are emitted as raw text widgets
            try {
                chain.addWidget(registry.textWidget(cleanHtml(current), pc.lexicalScopes.peek()));
            } catch (ExpressionCompileException e) {
                pc.errors.add(CompileError.in(node.outerHtml()).near(line(node)).causedBy(e));
            }
            continue;
        }
        if (current instanceof XmlDeclaration) {
            try {
                chain.addWidget(registry.xmlDirectiveWidget(((XmlDeclaration) current).getWholeDeclaration(), pc.lexicalScopes.peek()));
            } catch (ExpressionCompileException e) {
                pc.errors.add(CompileError.in(node.outerHtml()).near(line(node)).causedBy(e));
            }
        }
    }
    // the chain holds one widget per successfully processed child (possibly none)
    return chain;
}

Example 68

Project: MyTv-master File: TvMaoCrawler.java View source code

/**
 * Parses a TV program table page.
 *
 * <p>Every {@code ul#pgrow li} item with at least two child nodes is turned into a
 * {@link ProgramTable}; each one is reported to all registered listeners and added to
 * the returned list. Parsing stops at the first such item that contains no
 * {@code span} element, returning whatever has been collected so far.
 *
 * @param html the HTML of the program table page
 * @return the program entries extracted from the page
 */
private List<ProgramTable> parseProgramTable(String html) {
    Document doc = Jsoup.parse(html);
    Elements dateElements = doc.select("div.pgmain div[class=\"mt10 clear\"] b:first-child");
    // header text is "<month-day> <weekday>", separated by whitespace
    String[] dateAndWeek = dateElements.get(0).text().trim().split("\\s+");
    String date = Calendar.getInstance().get(Calendar.YEAR) + "-" + dateAndWeek[0];
    int week = weekStringToInt(dateAndWeek[1]);
    Elements stationElements = doc.select("aside[class=\"related-aside rt\"] section[class=\"aside-section clear\"] div.bar");
    String stationName = stationElements.get(0).text().trim();
    Elements programElements = doc.select("ul#pgrow li");
    List<ProgramTable> resultList = new ArrayList<ProgramTable>();
    for (Element item : programElements) {
        List<Node> nodes = item.childNodes();
        int nodeCount = nodes.size();
        if (nodeCount < 2) {
            continue;
        }
        // locate the air time: the first <span> child
        int cursor = 0;
        while (cursor < nodeCount) {
            Node candidate = nodes.get(cursor);
            if (candidate instanceof Element && "SPAN".equalsIgnoreCase(((Element) candidate).tagName())) {
                break;
            }
            cursor++;
        }
        if (cursor == nodeCount) {
            logger.info("the program table of " + stationName + " at " + date + " does not exists.");
            return resultList;
        }
        String airTime = ((Element) nodes.get(cursor)).text().trim();
        cursor++;
        // collect the program name: text nodes up to and including the first <a>
        StringBuilder program = new StringBuilder();
        while (cursor < nodeCount) {
            Node current = nodes.get(cursor++);
            if (current instanceof TextNode) {
                program.append(((TextNode) current).text().trim());
                continue;
            }
            if (current instanceof Element && "A".equalsIgnoreCase(((Element) current).tagName())) {
                program.append(((Element) current).text().trim());
                break;
            }
        }
        if (cursor < nodeCount - 1) {
            // there is still a text node following the link
            Node trailing = nodes.get(cursor);
            if (trailing instanceof TextNode) {
                program.append(((TextNode) trailing).text().trim());
            }
        }
        ProgramTable pt = new ProgramTable();
        pt.setAirDate(date);
        pt.setAirTime(date + " " + airTime);
        pt.setProgram(program.toString().trim());
        pt.setStationName(stationName);
        pt.setWeek(week);
        ProgramTableFoundEvent found;
        for (CrawlEventListener listener : listeners) {
            found = new ProgramTableFoundEvent(this, pt);
            listener.itemFound(found);
        }
        resultList.add(pt);
    }
    return resultList;
}

Example 69

Project: ScreenSlicer-master File: CommonUtil.java View source code

/**
 * Sets the document's output charset ("ascii" or "utf-8") and then visits every node,
 * replacing the text of each non-empty "#text" node with the result of
 * {@code HtmlCoder.decode} applied to the node's string form.
 *
 * @param doc   the document to process; it is modified in place
 * @param ascii {@code true} to output as "ascii", {@code false} for "utf-8"
 * @return the same document instance
 */
private static Element sanitize(Document doc, final boolean ascii) {
    doc.outputSettings().charset(ascii ? "ascii" : "utf-8");
    doc.traverse(new NodeVisitor() {

        @Override
        public void head(Node n, int d) {
            try {
                boolean isText = n.nodeName().equals("#text");
                if (!isText || CommonUtil.isEmpty(n.outerHtml())) {
                    return;
                }
                TextNode textNode = (TextNode) n;
                textNode.text(HtmlCoder.decode(n.toString()));
            } catch (Throwable t) {
                // a failure on one node is logged and does not abort the traversal
                Log.exception(t);
            }
        }

        @Override
        public void tail(Node n, int d) {
            // nothing to do when leaving a node
        }
    });
    return doc;
}

Example 70

Project: slicer-master File: CommonUtil.java View source code

/**
 * Selects the output charset of the document ("ascii" when {@code ascii} is set,
 * otherwise "utf-8") and rewrites each non-empty "#text" node with the value returned
 * by {@code HtmlCoder.decode} for the node's string form.
 *
 * @param doc   the document to process; it is modified in place
 * @param ascii whether the output charset should be "ascii" instead of "utf-8"
 * @return the document that was passed in
 */
private static Element sanitize(Document doc, final boolean ascii) {
    final String charsetName = ascii ? "ascii" : "utf-8";
    doc.outputSettings().charset(charsetName);
    doc.traverse(new NodeVisitor() {

        @Override
        public void head(Node n, int d) {
            try {
                if (!n.nodeName().equals("#text")) {
                    return;
                }
                if (CommonUtil.isEmpty(n.outerHtml())) {
                    return;
                }
                TextNode text = (TextNode) n;
                text.text(HtmlCoder.decode(n.toString()));
            } catch (Throwable t) {
                // log and keep going so that one bad node does not stop the traversal
                Log.exception(t);
            }
        }

        @Override
        public void tail(Node n, int d) {
            // no work needed on the way back up
        }
    });
    return doc;
}

Example 71

Project: GameRaven-master File: AllInOneV2.java View source code

@SuppressLint("SetJavaScriptEnabled")
public void processContent(NetDesc desc, Document doc, String resUrl) {
    if (BuildConfig.DEBUG)
        wtl("GRAIO hNR fired, desc: " + desc.name());
    swipeRefreshLayout.setEnabled(false);
    if (searchIcon != null)
        searchIcon.collapseActionView();
    setAllMenuItemsExceptRefreshVisibility(false);
    adapterRows.clear();
    boolean isDefaultAcc = Session.getUser() != null && Session.getUser().equals(settings.getString("defaultAccount", HeaderSettings.NO_DEFAULT_ACCOUNT));
    if (BuildConfig.DEBUG)
        wtl("setting board, topic, message id to null");
    boardID = null;
    topicID = null;
    messageIDForEditing = null;
    Element tbody;
    Element pj;
    String headerTitle;
    String firstPage = null;
    String prevPage = null;
    int[] pagesInfo = new int[] { 1, 1 };
    String nextPage = null;
    String lastPage = null;
    String pagePrefix = null;
    if (BuildConfig.DEBUG)
        wtl("checking for board quick list");
    Element boardsDropdown = null;
    for (Element e : doc.select("ul.masthead_mygames_subnav")) {
        if (e.previousElementSibling().ownText().equals("My Boards")) {
            boardsDropdown = e;
            break;
        }
    }
    if (boardsDropdown != null) {
        Elements dItems = boardsDropdown.getElementsByTag("a");
        boardQuickListOptions = new String[dItems.size() + 1];
        boardQuickListLinks = new String[dItems.size() + 1];
        boardQuickListOptions[0] = "Go to Boards Page...";
        int x = 1;
        for (Element e : dItems) {
            boardQuickListOptions[x] = e.text();
            boardQuickListLinks[x] = e.attr("href");
            x++;
        }
    }
    contentList.setDividerHeight(Theming.convertDPtoPX(this, 1));
    switch(desc) {
        case BOARD_JUMPER:
        case LOGIN_S2:
            updateHeaderNoJumper("Board Jumper", NetDesc.BOARD_JUMPER);
            setMenuItemVisibility(searchIcon, true);
            processBoards(doc);
            break;
        case BOARD_LIST:
            updateHeaderNoJumper(doc.getElementsByTag("th").get(4).text(), NetDesc.BOARD_LIST);
            processBoards(doc);
            break;
        case NOTIFS_PAGE:
            settings.edit().putLong("notifsLastCheck", System.currentTimeMillis()).apply();
            tbody = doc.getElementsByTag("tbody").first();
            headerTitle = Session.getUser() + "'s Notifications";
            updateHeaderNoJumper(headerTitle, desc);
            if (tbody != null) {
                for (Element row : tbody.getElementsByTag("tr")) {
                    Elements cells = row.children();
                    // [title, url] [time] [read]
                    Element titleLinkElem = cells.get(0).children().first();
                    String title = titleLinkElem.text();
                    String link = titleLinkElem.attr("href");
                    String time = cells.get(1).text();
                    boolean isOld = false;
                    if (cells.get(2).text().equals("Read"))
                        isOld = true;
                    adapterRows.add(new NotifRowData(title, time, link, isOld));
                }
            } else {
                adapterRows.add(new HeaderRowData("You have no notifications at this time."));
            }
            setMenuItemVisibility(clearUnreadNotifsIcon, true);
            NotifierService.notifDismiss(this);
            break;
        case MENTIONS_PAGE:
            tbody = doc.getElementsByTag("tbody").first();
            headerTitle = Session.getUser() + "'s Mentions";
            updateHeaderNoJumper(headerTitle, desc);
            if (tbody != null) {
                for (Element row : tbody.getElementsByTag("tr")) {
                    Elements cells = row.children();
                    // [topic] [board] [user] [time]
                    Element topicLinkElem = cells.get(0).children().first();
                    String topic = topicLinkElem.text();
                    String link = topicLinkElem.attr("href");
                    String board = cells.get(1).text();
                    String user = cells.get(2).text();
                    String time = cells.get(3).text();
                    adapterRows.add(new MentionRowData(topic, board, user, time, link));
                }
            } else {
                adapterRows.add(new HeaderRowData("You have no mentions at this time."));
            }
            break;
        case PM_INBOX:
        case PM_OUTBOX:
            tbody = doc.getElementsByTag("tbody").first();
            boolean isInbox = false;
            if (desc == NetDesc.PM_INBOX)
                isInbox = true;
            if (isInbox)
                headerTitle = Session.getUser() + "'s PM Inbox";
            else
                headerTitle = Session.getUser() + "'s PM Outbox";
            if (tbody != null) {
                pj = doc.select("ul.paginate").first();
                if (pj != null) {
                    pagesInfo = getPageJumperInfo(pj);
                    if (isInbox)
                        pagePrefix = "/pm/?page=";
                    else
                        pagePrefix = "/pm/sent?page=";
                    if (pagesInfo[0] > 1) {
                        firstPage = pagePrefix + 0;
                        prevPage = pagePrefix + (pagesInfo[0] - 2);
                    }
                    if (pagesInfo[0] != pagesInfo[1]) {
                        nextPage = pagePrefix + pagesInfo[0];
                        lastPage = pagePrefix + (pagesInfo[1] - 1);
                    }
                }
                updateHeader(headerTitle, firstPage, prevPage, pagesInfo[0], pagesInfo[1], nextPage, lastPage, pagePrefix, desc);
                for (Element row : tbody.getElementsByTag("tr")) {
                    Elements cells = row.children();
                    // [icon] [sender] [subject] [time] [check]
                    boolean isOld = true;
                    if (cells.get(0).children().first().hasClass("fa-circle"))
                        isOld = false;
                    String sender = cells.get(1).text();
                    Element subjectLinkElem = cells.get(2).children().first();
                    String subject = subjectLinkElem.text();
                    String link = subjectLinkElem.attr("href");
                    String time = cells.get(3).text();
                    adapterRows.add(new PMRowData(subject, sender, time, link, isOld, isInbox));
                }
            } else {
                updateHeaderNoJumper(headerTitle, desc);
                adapterRows.add(new HeaderRowData("There are no private messages here at this time."));
            }
            fab.setVisibility(View.VISIBLE);
            pMode = PostMode.NEW_PM;
            if (isInbox)
                setMenuItemVisibility(pmOutboxIcon, true);
            else
                setMenuItemVisibility(pmInboxIcon, true);
            break;
        case PM_INBOX_DETAIL:
        case PM_OUTBOX_DETAIL:
            String pmTitle = doc.select("h2.title").first().text();
            String pmMessage = doc.select("div.body").first().outerHtml();
            Element foot = doc.select("div.foot").first();
            foot.child(1).remove();
            String pmFoot = foot.outerHtml();
            //Sent by: P4wn4g3 on 6/1/2013 2:15:55 PM
            String footText = foot.text();
            String sender = footText.substring(9, footText.indexOf(" on "));
            updateHeaderNoJumper(pmTitle, desc);
            if (desc == NetDesc.PM_INBOX_DETAIL) {
                replyTo = sender;
                if (!pmTitle.startsWith("Re: "))
                    replySubject = "Re: " + pmTitle;
                else
                    replySubject = pmTitle;
                setMenuItemVisibility(replyIcon, true);
            }
            adapterRows.add(new PMDetailRowData(sender, pmTitle, pmMessage + pmFoot));
            break;
        case AMP_LIST:
            if (BuildConfig.DEBUG)
                wtl("GRAIO hNR determined this is an amp response");
            tbody = doc.getElementsByTag("tbody").first();
            headerTitle = Session.getUser() + "'s Active Messages";
            if (doc.select("ul.paginate").size() > 1) {
                pj = doc.select("ul.paginate").get(1);
                if (pj != null && !pj.hasClass("user") && !pj.hasClass("tsort")) {
                    pagesInfo = getPageJumperInfo(pj);
                    pagePrefix = buildAMPLink() + "&page=";
                    if (pagesInfo[0] > 1) {
                        firstPage = pagePrefix + 0;
                        prevPage = pagePrefix + (pagesInfo[0] - 2);
                    }
                    if (pagesInfo[0] != pagesInfo[1]) {
                        nextPage = pagePrefix + pagesInfo[0];
                        lastPage = pagePrefix + (pagesInfo[1] - 1);
                    }
                }
            }
            updateHeader(headerTitle, firstPage, prevPage, pagesInfo[0], pagesInfo[1], nextPage, lastPage, pagePrefix, NetDesc.AMP_LIST);
            if (!tbody.children().isEmpty()) {
                for (Element row : tbody.children()) {
                    // [board] [read status] [title] [msg] [last post] [your last post]
                    Elements cells = row.children();
                    String board = cells.get(0).text();
                    Element titleLinkElem = cells.get(2).child(0);
                    String title = titleLinkElem.text();
                    String link = titleLinkElem.attr("href");
                    String mCount = cells.get(3).textNodes().get(0).text().trim();
                    Element lPostLinkElem = cells.get(4).child(1);
                    String lPost = lPostLinkElem.text();
                    String lPostLink = lPostLinkElem.attr("href");
                    ReadStatus status = ReadStatus.UNREAD;
                    String tImg = cells.get(1).child(0).className();
                    if (tImg.endsWith("_read"))
                        status = ReadStatus.READ;
                    else if (tImg.endsWith("_unread")) {
                        status = ReadStatus.NEW_POST;
                        lPostLink = cells.get(1).child(0).attr("href");
                    }
                    adapterRows.add(new AMPRowData(title, board, lPost, mCount, link, lPostLink, status));
                }
            } else {
                adapterRows.add(new HeaderRowData("You have no active messages at this time."));
            }
            if (BuildConfig.DEBUG)
                wtl("amp response block finished");
            break;
        case TRACKED_TOPICS:
            headerTitle = Session.getUser() + "'s Tracked Topics";
            updateHeaderNoJumper(headerTitle, desc);
            tbody = doc.getElementsByTag("tbody").first();
            if (tbody != null) {
                for (Element row : tbody.children()) {
                    // [remove] [title] [board name] [msgs] [last [pst]
                    Elements cells = row.children();
                    int rsMod = 0;
                    if (cells.size() == 6)
                        rsMod = 1;
                    String removeLink = cells.get(0).child(0).attr("href");
                    String topicLink = cells.get(1 + rsMod).child(0).attr("href");
                    String topicText = cells.get(1 + rsMod).text();
                    String board = cells.get(2 + rsMod).text();
                    String msgs = cells.get(3 + rsMod).text();
                    String lPostLink = cells.get(4 + rsMod).child(0).attr("href");
                    String lPostText = cells.get(4 + rsMod).text();
                    ReadStatus status = ReadStatus.UNREAD;
                    if (rsMod == 1) {
                        String tImg = cells.get(1).child(0).className();
                        if (tImg.endsWith("_read"))
                            status = ReadStatus.READ;
                        else if (tImg.endsWith("_unread"))
                            status = ReadStatus.NEW_POST;
                    }
                    adapterRows.add(new TrackedTopicRowData(board, topicText, lPostText, msgs, topicLink, removeLink, lPostLink, status));
                }
            } else {
                adapterRows.add(new HeaderRowData("You have no tracked topics at this time."));
            }
            break;
        case BOARD:
            if (BuildConfig.DEBUG)
                wtl("GRAIO hNR determined this is a board response");
            if (BuildConfig.DEBUG)
                wtl("setting board id");
            boardID = parseBoardID(resUrl);
            boolean isSplitList = false;
            if (doc.getElementsByTag("th").first() != null) {
                if (doc.getElementsByTag("th").first().text().equals("Board Title")) {
                    if (BuildConfig.DEBUG)
                        wtl("is actually a split board list");
                    updateHeaderNoJumper(doc.select("h1.page-title").first().text(), NetDesc.BOARD);
                    processBoards(doc);
                    isSplitList = true;
                }
            }
            if (!isSplitList) {
                String searchQuery = EMPTY_STRING;
                String searchPJAddition = EMPTY_STRING;
                if (resUrl.contains("search=")) {
                    if (BuildConfig.DEBUG)
                        wtl("board search url: " + resUrl);
                    searchQuery = resUrl.substring(resUrl.indexOf("search=") + 7);
                    int i = searchQuery.indexOf('&');
                    if (i != -1)
                        searchQuery = searchQuery.replace(searchQuery.substring(i), EMPTY_STRING);
                    searchPJAddition = "&search=" + searchQuery;
                    try {
                        searchQuery = URLDecoder.decode(searchQuery, DocumentParser.CHARSET_NAME);
                    } catch (UnsupportedEncodingException e) {
                        throw new AssertionError(DocumentParser.CHARSET_NAME + " is unknown");
                    }
                }
                Element headerElem = doc.getElementsByClass("page-title").first();
                if (headerElem != null)
                    headerTitle = headerElem.text();
                else
                    headerTitle = "GFAQs Cache Error, Board Title Not Found";
                if (searchQuery.length() > 0)
                    headerTitle += " (search: " + searchQuery + ")";
                if (doc.select("ul.paginate").size() > 1) {
                    pj = doc.select("ul.paginate").get(1);
                    if (pj != null && !pj.hasClass("user")) {
                        pagesInfo = getPageJumperInfo(pj);
                        pagePrefix = "boards/" + boardID + "?page=";
                        if (pagesInfo[0] > 1) {
                            firstPage = pagePrefix + 0 + searchPJAddition;
                            prevPage = pagePrefix + (pagesInfo[0] - 2) + searchPJAddition;
                        }
                        if (pagesInfo[0] != pagesInfo[1]) {
                            nextPage = pagePrefix + pagesInfo[0] + searchPJAddition;
                            lastPage = pagePrefix + (pagesInfo[1] - 1) + searchPJAddition;
                            if (pagesInfo[0] > pagesInfo[1]) {
                                session.forceNoHistoryAddition();
                                session.forceSkipAIOCleanup();
                                Crouton.showText(this, "Page count higher than page amount, going to last page...", Theming.croutonStyle());
                                session.get(NetDesc.BOARD, lastPage);
                                return;
                            }
                        }
                    }
                }
                updateHeader(headerTitle, firstPage, prevPage, pagesInfo[0], pagesInfo[1], nextPage, lastPage, pagePrefix + searchPJAddition, NetDesc.BOARD);
                setMenuItemVisibility(searchIcon, true);
                if (Session.isLoggedIn()) {
                    Element favbtn = doc.getElementsByClass("user").first().getElementsByAttributeValueStarting("onclick", "post_click").first();
                    if (favbtn != null) {
                        String favtext = favbtn.text().toLowerCase();
                        String onclick = favbtn.attr("onclick");
                        int endPoint = onclick.lastIndexOf('\'');
                        int startPoint = onclick.lastIndexOf('\'', endPoint - 1) + 1;
                        favKey = onclick.substring(startPoint, endPoint);
                        fMode = FavMode.ON_BOARD;
                        if (favtext.contains("add to favorites"))
                            setMenuItemVisibility(addFavIcon, true);
                        else if (favtext.contains("remove favorite"))
                            setMenuItemVisibility(remFavIcon, true);
                    }
                    updatePostingRights(doc, false);
                }
                Element splitList = doc.select("p:contains(this is a split board)").first();
                if (splitList != null) {
                    String splitListLink = splitList.child(0).attr("href");
                    adapterRows.add(new BoardRowData("This is a Split Board.", "Click here to return to the Split List.", null, null, null, splitListLink, BoardType.SPLIT));
                }
                Element table = doc.select("table.board").first();
                if (table != null && !table.select("td").first().hasAttr("colspan")) {
                    table.getElementsByTag("col").get(2).remove();
                    table.getElementsByTag("th").get(2).remove();
                    table.getElementsByTag("col").get(0).remove();
                    table.getElementsByTag("th").get(0).remove();
                    if (BuildConfig.DEBUG)
                        wtl("board row parsing start");
                    boolean skipFirst = true;
                    Set<String> hlUsers = hlDB.getHighlightedUsers().keySet();
                    for (Element row : table.getElementsByTag("tr")) {
                        if (!skipFirst) {
                            Elements cells = row.getElementsByTag("td");
                            // cells = [image] [title] [author] [post count] [last post]
                            String tImg = cells.get(0).child(0).className();
                            Element titleLinkElem = cells.get(1).child(0);
                            String title = titleLinkElem.text();
                            String tUrl = titleLinkElem.attr("href");
                            String tc = cells.get(2).text();
                            Element lPostLinkElem = cells.get(4).child(0);
                            String lastPost = lPostLinkElem.text();
                            String lpUrl = lPostLinkElem.attr("href");
                            String mCount = cells.get(3).text();
                            TopicType type = TopicType.NORMAL;
                            if (tImg.contains("poll"))
                                type = TopicType.POLL;
                            else if (tImg.contains("sticky"))
                                type = TopicType.PINNED;
                            else if (tImg.contains("closed"))
                                type = TopicType.LOCKED;
                            else if (tImg.contains("archived"))
                                type = TopicType.ARCHIVED;
                            if (BuildConfig.DEBUG)
                                wtl(tImg + ", " + type.name());
                            ReadStatus status = ReadStatus.UNREAD;
                            if (tImg.endsWith("_read"))
                                status = ReadStatus.READ;
                            else if (tImg.endsWith("_unread")) {
                                status = ReadStatus.NEW_POST;
                                lpUrl = cells.get(0).child(0).attr("href");
                            }
                            int hlColor = 0;
                            if (hlUsers.contains(tc.toLowerCase(Locale.US))) {
                                HighlightedUser hUser = hlDB.getHighlightedUsers().get(tc.toLowerCase(Locale.US));
                                hlColor = hUser.getColor();
                                tc += " (" + hUser.getLabel() + ")";
                            }
                            adapterRows.add(new TopicRowData(title, tc, lastPost, mCount, tUrl, lpUrl, type, status, hlColor));
                        } else
                            skipFirst = false;
                    }
                    if (BuildConfig.DEBUG)
                        wtl("board row parsing end");
                } else {
                    adapterRows.add(new HeaderRowData("There are no topics at this time."));
                }
            }
            if (BuildConfig.DEBUG)
                wtl("board response block finished");
            break;
        case TOPIC:
            contentList.setDividerHeight(0);
            boardID = parseBoardID(resUrl);
            topicID = parseTopicID(resUrl);
            tlUrl = "boards/" + boardID;
            if (BuildConfig.DEBUG)
                wtl(tlUrl);
            setMenuItemVisibility(topicListIcon, true);
            Element headerElem = doc.getElementsByClass("title").first();
            if (headerElem != null)
                headerTitle = headerElem.text();
            else
                headerTitle = "GFAQs Cache Error, Title Not Found";
            if (headerTitle.equals("Log In to GameFAQs")) {
                headerElem = doc.getElementsByClass("title").get(1);
                if (headerElem != null)
                    headerTitle = headerElem.text();
            }
            if (doc.select("ul.paginate").size() > 1) {
                pj = doc.select("ul.paginate").get(1);
                if (pj != null && !pj.hasClass("user")) {
                    pagesInfo = getPageJumperInfo(pj);
                    pagePrefix = "boards/" + boardID + "/" + topicID + "?page=";
                    if (pagesInfo[0] > 1) {
                        firstPage = pagePrefix + 0;
                        prevPage = pagePrefix + (pagesInfo[0] - 2);
                    }
                    if (pagesInfo[0] != pagesInfo[1]) {
                        nextPage = pagePrefix + pagesInfo[0];
                        lastPage = pagePrefix + (pagesInfo[1] - 1);
                        if (pagesInfo[0] > pagesInfo[1]) {
                            session.forceNoHistoryAddition();
                            session.forceSkipAIOCleanup();
                            Crouton.showText(this, "Page count higher than page amount, going to last page...", Theming.croutonStyle());
                            session.get(NetDesc.TOPIC, lastPage);
                            return;
                        }
                    }
                }
            }
            updateHeader(headerTitle, firstPage, prevPage, pagesInfo[0], pagesInfo[1], nextPage, lastPage, pagePrefix, NetDesc.TOPIC);
            if (Session.isLoggedIn()) {
                Element favbtn = doc.getElementsByClass("user").first().getElementsByAttributeValueStarting("onclick", "post_click").first();
                if (favbtn != null) {
                    String favtext = favbtn.text().toLowerCase();
                    String onclick = favbtn.attr("onclick");
                    int endPoint = onclick.lastIndexOf('\'');
                    int startPoint = onclick.lastIndexOf('\'', endPoint - 1) + 1;
                    favKey = onclick.substring(startPoint, endPoint);
                    fMode = FavMode.ON_TOPIC;
                    if (favtext.contains("track topic"))
                        setMenuItemVisibility(addFavIcon, true);
                    else if (favtext.contains("stop tracking"))
                        setMenuItemVisibility(remFavIcon, true);
                }
                updatePostingRights(doc, true);
            }
            String goToThisPost = null;
            if (goToUrlDefinedPost) {
                if (resUrl.indexOf('#') != -1) {
                    goToThisPost = resUrl.substring(resUrl.indexOf('#'));
                } else {
                    // goToUrlDefinedPost is true when there is no url defined post, oops
                    goToUrlDefinedPost = false;
                }
            }
            Elements rows = doc.select("table.board").first().getElementsByTag("tr");
            int rowCount = rows.size();
            int msgIndex = 0;
            Set<String> hlUsers = hlDB.getHighlightedUsers().keySet();
            for (int x = 0; x < rowCount; x++) {
                Element row = rows.get(x);
                if (row.select("div.msg_deleted").isEmpty()) {
                    String user;
                    String postNum;
                    String postTime;
                    String mID = null;
                    String userTitles = EMPTY_STRING;
                    Element msgBody;
                    boolean canReport = false, canDelete = false, canEdit = false, canQuote = false;
                    Element infoBox = row.select("div.msg_infobox").first();
                    user = infoBox.getElementsByTag("b").first().text();
                    Element userInfo = infoBox.select("span.user_info").first();
                    if (userInfo != null)
                        userTitles = " " + userInfo.text();
                    Element userTag = infoBox.select("span.tag").first();
                    if (userTag != null)
                        userTitles += " (" + userTag.text() + ")";
                    postTime = infoBox.select("span.post_time").first().text();
                    Element number = infoBox.select("span.message_num").first();
                    postNum = number.text();
                    if (!number.children().isEmpty()) {
                        mID = parseMessageID(number.child(0).attr("href"));
                    }
                    msgBody = row.select("div.msg_body").first();
                    Element msgBelow = row.select("div.msg_below").first();
                    Element edited = msgBelow.select("span.edited").first();
                    if (edited != null)
                        userTitles += " (edited)";
                    Element belowOptions = msgBelow.select("span.options").first();
                    if (belowOptions != null) {
                        String options = belowOptions.text();
                        if (options.contains("report"))
                            canReport = true;
                        if (options.contains("delete"))
                            canDelete = true;
                        if (options.contains("edit"))
                            canEdit = true;
                        if (options.contains("quote"))
                            canQuote = true;
                    }
                    int hlColor = 0;
                    if (hlUsers.contains(user.toLowerCase(Locale.US))) {
                        HighlightedUser hUser = hlDB.getHighlightedUsers().get(user.toLowerCase(Locale.US));
                        hlColor = hUser.getColor();
                        userTitles += " (" + hUser.getLabel() + ")";
                    }
                    if (goToUrlDefinedPost) {
                        if (postNum.equals(goToThisPost))
                            goToThisIndex = msgIndex;
                    }
                    String avatarUrl = row.getElementsByClass("imgboxart").first().attr("src");
                    if (BuildConfig.DEBUG)
                        wtl("creating messagerowdata object");
                    adapterRows.add(new MessageRowData(user, userTitles, avatarUrl, postNum, postTime, msgBody, boardID, topicID, mID, hlColor, canReport, canDelete, canEdit, canQuote));
                } else {
                    String postNum = row.select("span.message_num").first().text();
                    if (goToUrlDefinedPost) {
                        if (postNum.equals(goToThisPost))
                            goToThisIndex = msgIndex;
                    }
                    adapterRows.add(new MessageRowData(true, postNum));
                }
                msgIndex++;
            }
            break;
        case MESSAGE_DETAIL:
            updateHeaderNoJumper("Message Detail", NetDesc.MESSAGE_DETAIL);
            boardID = parseBoardID(resUrl);
            topicID = parseTopicID(resUrl);
            String mID = parseMessageID(resUrl);
            Elements msgRows = doc.select("td.msg");
            adapterRows.add(new HeaderRowData("Current Version"));
            MessageRowData msg;
            int msgRowCount = msgRows.size();
            for (int x = 0; x < msgRowCount; x++) {
                if (x == 1)
                    adapterRows.add(new HeaderRowData("Previous Version(s)"));
                Element currRow = msgRows.get(x);
                Element msgInfobox = currRow.select("div.msg_infobox").first();
                Element msgBody = currRow.select("div.msg_body").first();
                String user = msgInfobox.getElementsByTag("b").first().text();
                String postTime = msgInfobox.select("span.post_time").first().text();
                msg = new MessageRowData(user, EMPTY_STRING, EMPTY_STRING, "#" + (msgRowCount - x), postTime, msgBody, boardID, topicID, mID, 0, false, false, false, false);
                msg.disableTopClick();
                adapterRows.add(msg);
            }
            break;
        case USER_TAG:
            if (BuildConfig.DEBUG)
                wtl("starting check for user tag success");
            Element error = doc.getElementsByClass("error").first();
            if (error == null) {
                Crouton.showText(this, "User tag updated successfully.", Theming.croutonStyle());
            } else {
                AlertDialog.Builder b = new AlertDialog.Builder(this);
                b.setTitle("There was an error tagging the user...");
                b.setMessage("Error message from GameFAQs:\n\n" + error.text());
                b.setPositiveButton("OK", null);
                b.show();
            }
        case USER_DETAIL:
            if (BuildConfig.DEBUG)
                wtl("starting user detail processing");
            tbody = doc.select("table.board").first().getElementsByTag("tbody").first();
            String name = null;
            String ID = null;
            String level = null;
            String creation = null;
            String lVisit = null;
            String sig = null;
            String karma = null;
            String AMP = null;
            String tagKey = null;
            String tagText = null;
            for (Element row : tbody.children()) {
                String label = row.child(0).text().toLowerCase(Locale.US);
                if (BuildConfig.DEBUG)
                    wtl("user detail row label: " + label);
                switch(label) {
                    case "user name":
                        name = row.child(1).text();
                        break;
                    case "user id":
                        ID = row.child(1).text();
                        break;
                    case "board user level":
                        level = row.child(1).html();
                        if (BuildConfig.DEBUG)
                            wtl("set level: " + level);
                        break;
                    case "account created":
                        creation = row.child(1).text();
                        break;
                    case "last visit":
                        lVisit = row.child(1).text();
                        break;
                    case "signature":
                        sig = row.child(1).html();
                        break;
                    case "karma":
                        karma = row.child(1).text();
                        break;
                    case "active messages posted":
                        AMP = row.child(1).text();
                        break;
                }
            }
            if (Session.isLoggedIn()) {
                Element pmIcon = doc.select("i.fa-envelope").last();
                if (pmIcon != null && pmIcon.attr("title").startsWith("Send a PM to"))
                    setMenuItemVisibility(sendUserPMIcon, true);
                setMenuItemVisibility(tagUserIcon, true);
                tagKey = doc.getElementsByAttributeValue("name", "key").attr("value");
                tagText = doc.getElementsByAttributeValue("name", "tag_text").attr("value");
                if (tagText == null)
                    tagText = "";
            }
            updateHeaderNoJumper(name + "'s Details", NetDesc.USER_DETAIL);
            userDetailData = new UserDetailRowData(name, ID, level, creation, lVisit, sig, karma, AMP, tagKey, tagText, resUrl);
            adapterRows.add(userDetailData);
            break;
        case GAME_SEARCH:
            if (BuildConfig.DEBUG)
                wtl("GRAIO hNR determined this is a game search response");
            if (BuildConfig.DEBUG)
                wtl("game search url: " + resUrl);
            String searchQuery = resUrl.substring(resUrl.indexOf("game=") + 5);
            int i = searchQuery.indexOf("&");
            if (i != -1)
                searchQuery = searchQuery.replace(searchQuery.substring(i), EMPTY_STRING);
            int pageIndex = resUrl.indexOf("page=");
            if (pageIndex != -1) {
                String currPage = resUrl.substring(pageIndex + 5);
                i = currPage.indexOf("&");
                if (i != -1)
                    currPage = currPage.replace(currPage.substring(i), EMPTY_STRING);
                pagesInfo[0] = Integer.parseInt(currPage) + 1;
            } else {
                pagesInfo[0] = 1;
            }
            if (pagesInfo[0] > 1) {
                firstPage = "/search/index.html?game=" + searchQuery + "&page=0";
                prevPage = "/search/index.html?game=" + searchQuery + "&page=" + (pagesInfo[0] - 2);
            }
            if (!doc.getElementsByClass("icon-angle-right").isEmpty()) {
                nextPage = "/search/index.html?game=" + searchQuery + "&page=" + (pagesInfo[0]);
            }
            try {
                headerTitle = "Searching games: " + URLDecoder.decode(searchQuery, DocumentParser.CHARSET_NAME) + EMPTY_STRING;
            } catch (UnsupportedEncodingException e) {
                throw new AssertionError(DocumentParser.CHARSET_NAME + " is unknown");
            }
            updateHeader(headerTitle, firstPage, prevPage, pagesInfo[0], -1, nextPage, lastPage, pagePrefix, NetDesc.GAME_SEARCH);
            setMenuItemVisibility(searchIcon, true);
            Elements gameSearchTables = doc.select("table.results");
            int tCount = gameSearchTables.size();
            int tCounter = 0;
            if (!gameSearchTables.isEmpty()) {
                for (Element table : gameSearchTables) {
                    tCounter++;
                    if (tCounter < tCount)
                        adapterRows.add(new HeaderRowData("Best Matches"));
                    else
                        adapterRows.add(new HeaderRowData("Good Matches"));
                    String prevPlatform = EMPTY_STRING;
                    if (BuildConfig.DEBUG)
                        wtl("board row parsing start");
                    for (Element row : table.getElementsByTag("tr")) {
                        if (row.parent().tagName().equals("tbody")) {
                            Elements cells = row.getElementsByTag("td");
                            // cells = [platform] [title] [faqs] [codes] [saves] [revs] [mygames] [q&a] [pics] [vids] [board]
                            String platform = cells.get(0).text();
                            String bName = cells.get(1).text();
                            String bUrl = cells.get(9).child(0).attr("href");
                            if (platform.codePointAt(0) == (' ')) {
                                platform = prevPlatform;
                            } else {
                                prevPlatform = platform;
                            }
                            adapterRows.add(new GameSearchRowData(bName, platform, bUrl));
                        }
                    }
                    if (BuildConfig.DEBUG)
                        wtl("board row parsing end");
                }
            } else {
                adapterRows.add(new HeaderRowData("No results."));
            }
            if (BuildConfig.DEBUG)
                wtl("game search response block finished");
            break;
        default:
            if (BuildConfig.DEBUG)
                wtl("GRAIO hNR determined response type is unhandled");
            getSupportActionBar().setTitle("Page unhandled - " + resUrl);
            break;
    }
    Element pmInboxLink = doc.select("i.fa-envelope").first();
    String pmButtonLabel = getString(R.string.pm_inbox);
    if (pmInboxLink != null) {
        pmButtonLabel += " " + ((TextNode) pmInboxLink.nextSibling()).text();
    }
    dwrPMInboxItem.setTitle(pmButtonLabel);
    Element notifsObject = doc.select("span.notifications").first();
    notifsAdapter.clear();
    notifsLinks.clear();
    notifsLinks.add("filler");
    String count = "0";
    if (notifsObject != null) {
        count = notifsObject.child(0).text();
        if (count.equals("1"))
            count = count + " " + getString(R.string.notification);
        else
            count = count + " " + getString(R.string.notifications);
        notifsAdapter.add(count);
        Elements notifsLines = notifsObject.getElementsByTag("li");
        notifsLines.remove(notifsLines.size() - 1);
        for (Element e : notifsLines) {
            notifsAdapter.add(e.text());
            notifsLinks.add(e.select("a").first().attr("href"));
        }
        notifsAdapter.add("View All");
        notifsLinks.add(NOTIFS_PAGE_LINK);
        notifsAdapter.add("Clear All");
        notifsLinks.add(NOTIFS_CLEAR_LINK);
        setMenuItemVisibility(unreadNotifsIcon, true);
    } else {
        notifsAdapter.add(count + " " + getString(R.string.notifications));
        notifsAdapter.add("View All");
        notifsLinks.add(NOTIFS_PAGE_LINK);
        setMenuItemVisibility(unreadNotifsIcon, false);
    }
    notifsAdapter.notifyDataSetChanged();
    swipeRefreshLayout.setEnabled(settings.getBoolean("enablePTR", false));
    viewAdapter.notifyDataSetChanged();
    if (consumeGoToUrlDefinedPost() && !Session.applySavedScroll) {
        contentList.post(new Runnable() {

            @Override
            public void run() {
                contentList.setSelection(goToThisIndex);
            }
        });
    } else if (Session.applySavedScroll) {
        contentList.post(new Runnable() {

            @Override
            public void run() {
                contentList.setSelectionFromTop(Session.savedScrollVal[0], Session.savedScrollVal[1]);
                Session.applySavedScroll = false;
            }
        });
    } else {
        contentList.post(new Runnable() {

            @Override
            public void run() {
                contentList.setSelectionAfterHeaderView();
            }
        });
    }
    if (swipeRefreshLayout.isRefreshing())
        swipeRefreshLayout.setRefreshing(false);
    if (BuildConfig.DEBUG)
        wtl("GRAIO hNR finishing");
}

Example 72

Project: structr-master File: Importer.java View source code

/**
 * Walks the direct children of {@code startNode}, creates a corresponding node for each of them
 * and recurses into the children of every node it created (except newly created templates and
 * components).
 *
 * <p>Per child the method:
 * <ul>
 * <li>sanitizes the node name and skips names listed in {@code ignoreElementNames};</li>
 * <li>turns non-blank comment, data and text nodes into comment / content nodes;</li>
 * <li>resolves {@code structr:template} and {@code structr:component} tags via their {@code src}
 * attribute, creating a new template / shared component when none is found;</li>
 * <li>creates every other tag through {@code page.createElement}, falling back to an HTML template
 * node when that returns {@code null};</li>
 * <li>copies id, class names and attributes onto the new node;</li>
 * <li>holds back comments that {@code commentHandler} recognizes as instructions and applies them
 * to the next node that is created.</li>
 * </ul>
 *
 * @param startNode jsoup node whose children are imported
 * @param parent node the new nodes are appended to; when {@code null} the new nodes are not attached
 * @param page page used to create the new nodes and set as their owner document
 * @param removeHashAttribute when {@code true}, the {@code DOMNode.dataHashProperty} attribute is
 *        removed from each source element before its attributes are copied
 * @param depth recursion depth; only passed on (incremented by one) to the recursive call
 * @return the first non-comment node created on this level, or {@code null} if there was none
 * @throws FrameworkException declared by the signature; presumably raised by the node and
 *         property operations called here
 */
private DOMNode createChildNodes(final Node startNode, final DOMNode parent, final Page page, final boolean removeHashAttribute, final int depth) throws FrameworkException {
    DOMNode rootElement = null;
    // NOTE(review): declared outside the loop, so a file downloaded for one child stays set for
    // all following siblings and is linked to them as well - confirm this is intended
    Linkable res = null;
    String instructions = null;
    final List<Node> children = startNode.childNodes();
    for (Node node : children) {
        String tag = node.nodeName();
        // clean tag: keep only letters, digits and the characters # : . _ -
        if (tag != null) {
            // the hyphen is placed last so it is a literal; written as ".-_" it formed a range
            // that dropped '-' and let characters such as '/', '<' and '>' through
            tag = tag.replaceAll("[^a-zA-Z0-9#:._-]+", "");
        }
        String type = CaseHelper.toUpperCamelCase(tag);
        String comment = null;
        String content = null;
        String id = null;
        StringBuilder classString = new StringBuilder();
        boolean isNewTemplateOrComponent = false;
        if (ArrayUtils.contains(ignoreElementNames, type)) {
            continue;
        }
        if (node instanceof Element) {
            Element el = ((Element) node);
            Set<String> classes = el.classNames();
            for (String cls : classes) {
                classString.append(cls).append(" ");
            }
            id = el.id();
            // do not download files when called from DeployCommand!
            if (!isDeployment) {
                String downloadAddressAttr = (ArrayUtils.contains(srcElements, tag) ? "src" : ArrayUtils.contains(hrefElements, tag) ? "href" : null);
                if (downloadAddressAttr != null && StringUtils.isNotBlank(node.attr(downloadAddressAttr))) {
                    String downloadAddress = node.attr(downloadAddressAttr);
                    res = downloadFile(downloadAddress, originalUrl);
                }
            }
            if (removeHashAttribute) {
                // Remove data-structr-hash attribute
                node.removeAttr(DOMNode.dataHashProperty.jsonName());
            }
        }
        // Data and comment nodes: put the text into the "comment" / "content" field without changes
        if (type.equals("#comment")) {
            comment = ((Comment) node).getData();
            tag = "";
            // Don't add content node for whitespace
            if (StringUtils.isBlank(comment)) {
                continue;
            }
            // store for later use
            commentSource.append(comment).append("\n");
            // check if comment contains instructions
            if (commentHandler != null && commentHandler.containsInstructions(comment)) {
                if (instructions != null) {
                    // unhandled instructions from previous iteration => empty content element
                    createEmptyContentNode(page, parent, commentHandler, instructions);
                }
                instructions = comment;
                continue;
            }
        } else if (type.equals("#data")) {
            tag = "";
            content = ((DataNode) node).getWholeData();
            // Don't add content node for whitespace
            if (StringUtils.isBlank(content)) {
                continue;
            }
        } else if (type.equals("#text")) {
            // Text-only nodes: strip the trailing newline and put the text into the "content" field
            tag = "";
            if (isDeployment) {
                // deployment keeps the raw text and only skips empty nodes
                content = trimTrailingNewline(((TextNode) node).getWholeText());
                if (content == null || content.length() == 0) {
                    continue;
                }
            } else {
                content = trimTrailingNewline(((TextNode) node).text());
                if (StringUtils.isBlank(content)) {
                    continue;
                }
            }
        }
        org.structr.web.entity.dom.DOMNode newNode = null;
        // create node
        if (StringUtils.isBlank(tag)) {
            // create comment or content node
            if (!StringUtils.isBlank(comment)) {
                newNode = (DOMNode) page.createComment(comment);
                newNode.setProperty(org.structr.web.entity.dom.Comment.contentType, "text/html");
            } else {
                newNode = (Content) page.createTextNode(content);
            }
        } else if ("structr:template".equals(tag)) {
            final String src = node.attr("src");
            if (src != null) {
                DOMNode template = null;
                if (DeployCommand.isUuid(src)) {
                    template = (DOMNode) StructrApp.getInstance().nodeQuery(NodeInterface.class).and(GraphObject.id, src).getFirst();
                    if (template == null) {
                        logger.warn("Template with UUID {} not found, this is a known bug", src);
                    }
                } else {
                    // lookup order: shared component by name, template by name, then create a new one
                    template = Importer.findSharedComponentByName(src);
                    if (template == null) {
                        template = Importer.findTemplateByName(src);
                        if (template == null) {
                            template = createNewTemplateNode(parent, node.childNodes());
                            isNewTemplateOrComponent = true;
                        }
                    }
                }
                if (template != null) {
                    newNode = template;
                    if (template.isSharedComponent()) {
                        // shared components are not used directly, a shallow clone refers to them
                        newNode = (DOMNode) template.cloneNode(false);
                        newNode.setProperty(DOMNode.sharedComponent, template);
                        newNode.setProperty(DOMNode.ownerDocument, page);
                    } else if (page != null) {
                        newNode.setProperty(DOMNode.ownerDocument, page);
                    }
                } else {
                    logger.warn("Unable to find template or shared component {}, template ignored!", src);
                }
            } else {
                logger.warn("Invalid template definition, missing src attribute!");
            }
        } else if ("structr:component".equals(tag)) {
            final String src = node.attr("src");
            if (src != null) {
                DOMNode component = null;
                if (DeployCommand.isUuid(src)) {
                    component = app.nodeQuery(DOMNode.class).and(GraphObject.id, src).getFirst();
                } else {
                    component = Importer.findSharedComponentByName(src);
                }
                if (component == null) {
                    component = createSharedComponent(node);
                }
                isNewTemplateOrComponent = true;
                if (component != null) {
                    newNode = (DOMNode) component.cloneNode(false);
                    newNode.setProperty(DOMNode.sharedComponent, component);
                    newNode.setProperty(DOMNode.ownerDocument, page);
                } else {
                    logger.warn("Unable to find shared component {} - ignored!", src);
                }
            } else {
                logger.warn("Invalid component definition, missing src attribute!");
            }
        } else {
            newNode = (org.structr.web.entity.dom.DOMElement) page.createElement(tag, true);
            if (newNode == null) {
                newNode = createNewHTMLTemplateNodeForUnsupportedTag(parent, node);
                isNewTemplateOrComponent = true;
            }
        }
        if (newNode != null) {
            // save root element for later use
            if (rootElement == null && !(newNode instanceof org.structr.web.entity.dom.Comment)) {
                rootElement = newNode;
            }
            newNode.setProperty(AbstractNode.visibleToPublicUsers, publicVisible);
            newNode.setProperty(AbstractNode.visibleToAuthenticatedUsers, authVisible);
            if (res != null) {
                newNode.setProperty(LinkSource.linkable, res);
            }
            // "id" attribute: Put it into the "_html_id" field
            if (StringUtils.isNotBlank(id)) {
                newNode.setProperty(DOMElement._id, id);
            }
            if (StringUtils.isNotBlank(classString.toString())) {
                newNode.setProperty(DOMElement._class, StringUtils.trim(classString.toString()));
            }
            for (Attribute nodeAttr : node.attributes()) {
                final String key = nodeAttr.getKey();
                // Don't add text attribute as _html_text because the text is already contained in the 'content' attribute
                if (!key.equals("text")) {
                    final String value = nodeAttr.getValue();
                    if (key.startsWith("data-")) {
                        if (key.startsWith(DATA_META_PREFIX)) {
                            // convert data-structr-meta-* attributes to local camel case properties on the node
                            int l = DATA_META_PREFIX.length();
                            String upperCaseKey = WordUtils.capitalize(key.substring(l), new char[] { '-' }).replaceAll("-", "");
                            if (upperCaseKey.isEmpty()) {
                                // nothing usable follows the prefix; the substring calls below would throw
                                logger.warn("Empty meta property name in attribute {}, ignoring.", key);
                            } else if (value != null) {
                                // first character is taken from the original key so its case is kept
                                String camelCaseKey = key.substring(l, l + 1).concat(upperCaseKey.substring(1));
                                // store value using actual input converter
                                final PropertyKey actualKey = StructrApp.getConfiguration().getPropertyKeyForJSONName(newNode.getClass(), camelCaseKey, false);
                                if (actualKey != null) {
                                    final PropertyConverter converter = actualKey.inputConverter(securityContext);
                                    if (converter != null) {
                                        final Object convertedValue = converter.convert(value);
                                        newNode.setProperty(actualKey, convertedValue);
                                    } else {
                                        newNode.setProperty(actualKey, value);
                                    }
                                } else {
                                    logger.warn("Unknown meta property key {}, ignoring.", camelCaseKey);
                                }
                            }
                        } else if (key.startsWith(DATA_STRUCTR_PREFIX)) {
                            // don't convert data-structr-* attributes as they are internal
                            final PropertyKey propertyKey = config.getPropertyKeyForJSONName(newNode.getClass(), key);
                            if (propertyKey != null) {
                                final PropertyConverter inputConverter = propertyKey.inputConverter(securityContext);
                                if (value != null && inputConverter != null) {
                                    newNode.setProperty(propertyKey, inputConverter.convert(value));
                                } else {
                                    newNode.setProperty(propertyKey, value);
                                }
                            }
                        } else {
                            // store data-* attributes in node
                            final PropertyKey propertyKey = new StringProperty(key);
                            if (value != null) {
                                newNode.setProperty(propertyKey, value);
                            }
                        }
                    } else {
                        boolean notBlank = StringUtils.isNotBlank(value);
                        boolean isAnchor = notBlank && value.startsWith("#");
                        boolean isLocal = notBlank && !value.startsWith("http");
                        boolean isActive = notBlank && value.contains("${");
                        boolean isStructrLib = notBlank && value.startsWith("/structr/js/");
                        // outside of deployment, local href/src values are replaced by link expressions
                        if ("link".equals(tag) && "href".equals(key) && isLocal && !isActive && !isDeployment) {
                            newNode.setProperty(new StringProperty(PropertyView.Html.concat(key)), "${link.path}?${link.version}");
                        } else if (("href".equals(key) || "src".equals(key)) && isLocal && !isActive && !isAnchor && !isStructrLib && !isDeployment) {
                            newNode.setProperty(new StringProperty(PropertyView.Html.concat(key)), "${link.path}");
                        } else {
                            newNode.setProperty(new StringProperty(PropertyView.Html.concat(key)), value);
                        }
                    }
                }
            }
            final StringProperty typeKey = new StringProperty(PropertyView.Html.concat("type"));
            if ("script".equals(tag)) {
                final String contentType = newNode.getProperty(typeKey);
                if (contentType == null) {
                    // Set default type of script tag to "text/javascript" to ensure inline JS gets imported properly
                    newNode.setProperty(typeKey, "text/javascript");
                } else if (contentType.equals("application/schema+json")) {
                    for (final Node scriptContentNode : node.childNodes()) {
                        final String source = scriptContentNode.toString();
                        // Import schema JSON
                        SchemaJsonImporter.importSchemaJson(source);
                    }
                } else if (contentType.equals("application/x-cypher")) {
                    for (final Node scriptContentNode : node.childNodes()) {
                        final String source = scriptContentNode.toString();
                        // import Cypher queries from script source
                        final GraphGistImporter importer = app.command(GraphGistImporter.class);
                        final List<String> sources = new ArrayList<>();
                        sources.add(source);
                        importer.importCypher(sources);
                    }
                    // NOTE(review): skips the attach-to-parent and recursion steps below for this node
                    continue;
                } else if (contentType.equals("application/x-structr-script")) {
                    for (final Node scriptContentNode : node.childNodes()) {
                        final String source = scriptContentNode.toString();
                        Actions.execute(securityContext, null, source, null);
                    }
                    continue;
                } else if (contentType.equals("application/x-structr-javascript")) {
                    for (final Node scriptContentNode : node.childNodes()) {
                        final String source = scriptContentNode.toString();
                        Actions.execute(securityContext, null, source, null);
                    }
                    continue;
                }
            }
            if (instructions != null) {
                if (instructions.contains("@structr:content") && !(newNode instanceof Content)) {
                    // unhandled instructions from previous iteration => empty content element
                    createEmptyContentNode(page, parent, commentHandler, instructions);
                } else {
                    // apply instructions to new DOM element
                    if (commentHandler != null) {
                        commentHandler.handleComment(page, newNode, instructions, true);
                    }
                }
                instructions = null;
            }
            // allow parent to be null to prevent direct child relationship
            if (parent != null) {
                // special handling for <head> elements
                if (newNode instanceof Head && parent instanceof Body) {
                    final org.w3c.dom.Node html = parent.getParentNode();
                    html.insertBefore(newNode, parent);
                } else {
                    parent.appendChild(newNode);
                }
            }
            // Step down and process child nodes except for newly created templates
            if (!isNewTemplateOrComponent) {
                createChildNodes(node, newNode, page, removeHashAttribute, depth + 1);
            }
        }
    }
    // reset instructions when leaving a level
    if (instructions != null) {
        createEmptyContentNode(page, parent, commentHandler, instructions);
        instructions = null;
    }
    return rootElement;
}

Example 73

Project: Diary.Ru-Client-master File: NetworkService.java View source code

/**
 * Applies our modifications to every diary page that gets loaded.
 * Page fixes requested by users are added here.
 *
 * @param resultPage the page that has to be modified (changed in place)
 */
private void mutateContent(Document resultPage) {
    // the page gets our stylesheet
    // NOTE(review): the title attribute below looks mis-encoded (probably Cyrillic originally) - confirm against the project sources
    String theme = mPreferences.getString("app.theme", "red");
    resultPage.head().append("<link rel=\"stylesheet\" href=\"file:///android_asset/css/" + theme + ".css\" type=\"text/css\" media=\"all\" title=\"Ð¡Ñ‚Ð°Ð½Ð´Ð°Ñ€Ñ‚\"/>");
    // make the repost button point to the right link
    Elements shareLinks = resultPage.select(".postLinks li[class^=quote]");
    for (Element shareLi : shareLinks) {
        // child(0) only counts element children, so check those rather than all child nodes;
        // an <li> holding only text would otherwise make child(0) throw
        if (shareLi.children().isEmpty())
            continue;
        Element repostLink = shareLi.child(0);
        Element diaryRepost = shareLi.select("div a[href*=newpost]").first();
        if (diaryRepost != null)
            repostLink.attr("href", diaryRepost.attr("href"));
    }
    // text instead of the edit buttons
    if (mUseTextInsteadOfImages) {
        Elements postActionImages = resultPage.select("ul.postActionLinks img");
        for (Element img : postActionImages) {
            // turn the image into its title text
            if (img.hasAttr("title")) {
                Node text = new TextNode(img.attr("title"), resultPage.baseUri());
                img.replaceWith(text);
            }
        }
    }
    // JS fixes
    Elements jsElems = resultPage.getElementsByAttribute("onclick");
    for (Element js : jsElems) {
        String link = js.attr("href");
        if (!link.contains("#more") && !link.contains("subscribe") && !link.contains("showresult") && !link.contains("up&signature=") && !link.contains("down&signature=") && !link.contains("tag_showedit"))
            // drop all JavaScript except MORE, moving posts up/down, poll results and subscription
            js.removeAttr("onclick");
    }
    // swap images for buttons when automatic image loading is switched off
    if (!mLoadImages) {
        Elements images = resultPage.select("img[src^=http], a:has(img)");
        for (Element current : images) {
            if (current.tagName().equals("img")) {
                String src = current.attr("src");
                if (!src.contains("diary.ru") && !current.parent().className().equals("avatar") && !src.startsWith("/")) {
                    // every image that does not match the criteria is replaced by a button that loads it on click
                    String jsButton = "<input type='image' src='file:///android_asset/images/load_image.png' onclick='return handleIMGDown(this, \"" + src + "\")' />";
                    current.after(jsButton);
                    current.remove();
                }
            }
            if (current.tagName().equals("a")) {
                String src = current.getElementsByTag("img").attr("src");
                if (!src.contains("diary.ru") && !current.parent().className().equals("avatar") && !src.startsWith("/")) {
                    // same replacement for linked images; the button also carries the link target
                    String jsButton = "<input type='image' src='file:///android_asset/images/load_image.png' onclick='return handleADown(this, \"" + current.attr("href") + "\", \"" + src + "\")' />";
                    current.after(jsButton);
                    current.remove();
                }
            }
        }
    }
    // include our JavaScript
    resultPage.body().append(Utils.javascriptContent);
    // the signature has to be visible to the JS methods
    resultPage.body().append("<script>var signature = '" + UserData.getInstance().getSignature() + "';</script>");
}

Example 74

Project: hn-android-master File: BaseHTMLParser.java View source code

/**
 * Returns the text of the first direct child of {@code element} that is a {@link TextNode}.
 *
 * @param element element whose child nodes are scanned; may be {@code null}
 * @return the text of the first text child, or an empty string when {@code element} is
 *         {@code null} or has no text child
 */
public static String getFirstTextValueInElementChildren(Element element) {
    if (element != null) {
        for (org.jsoup.nodes.Node child : element.childNodes()) {
            if (child instanceof TextNode) {
                return ((TextNode) child).text();
            }
        }
    }
    return "";
}

Example 75

Project: webmagic-master File: CssSelector.java View source code

/**
 * Concatenates the text of all direct {@link TextNode} children of {@code element}.
 * Text inside nested elements is not included.
 *
 * @param element element whose child nodes are scanned
 * @return the joined text of the direct text children, empty if there are none
 */
protected String getText(Element element) {
    final StringBuilder ownText = new StringBuilder();
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        ownText.append(((TextNode) child).text());
    }
    return ownText.toString();
}

Example 76

Project: karma-exchange-master File: HtmlUtil.java View source code

// Called when the node is first seen: appends the text of text nodes
// and a bullet marker for "li" nodes; other nodes are ignored.
public void head(Node node, int depth) {
    final String nodeName = node.nodeName();
    if (node instanceof TextNode) {
        // TextNodes carry all user-readable text in the DOM.
        append(((TextNode) node).text());
        return;
    }
    if (nodeName.equals("li")) {
        append("\n * ");
    }
}

Example 77

Project: nate-master File: JsoupBackedNateDocumentFragment.java View source code

/**
 * Replaces the pseudo root with a freshly created one that holds a single text node.
 *
 * @param text text placed into the new text node
 */
@Override
public void setTextContent(String text) {
    pseudoRoot = createPseudoRootElement();
    final TextNode content = new TextNode(text, JsoupBackedNateDocumentFactory.BASE_URI);
    pseudoRoot.appendChild(content);
}

Example 78

Project: opensearchserver-master File: JSoupHtmlNode.java View source code

/**
 * Returns the text of the wrapped node, which is cast to {@link TextNode}.
 *
 * @return the text of the wrapped text node
 */
@Override
public String getText() {
    return ((TextNode) node).text();
}

Example 79

Project: FitGoodies-master File: FitCell.java View source code

/**
 * Passes {@code message} to {@code rawInfo} as the outer HTML of a text node
 * (presumably so that the message ends up HTML-escaped - confirm).
 *
 * @param message text to report; {@code null} is ignored
 */
public void info(String message) {
    if (message != null) {
        final TextNode messageNode = new TextNode(message, td.baseUri());
        rawInfo(messageNode.outerHtml());
    }
}

Example 80

Project: StartupNews-master File: BaseHTMLParser.java View source code

/**
 * Looks for the first direct child of {@code element} that is a {@link TextNode}
 * and returns its text.
 *
 * @param element element whose child nodes are scanned; may be {@code null}
 * @return text of the first text child; an empty string if {@code element} is
 *         {@code null} or none of its children is a text node
 */
public static String getFirstTextValueInElementChildren(Element element) {
    String firstText = "";
    if (element != null) {
        for (org.jsoup.nodes.Node child : element.childNodes()) {
            if (child instanceof TextNode) {
                firstText = ((TextNode) child).text();
                break;
            }
        }
    }
    return firstText;
}

Example 81

Project: FudanBBS-master File: HtmlToPlainText.java View source code

// Invoked when the node is first seen. Text nodes contribute their text,
// "li" nodes contribute a bullet on a new line, anything else contributes nothing.
public void head(Node node, int depth) {
    final String tagName = node.nodeName();
    final boolean isText = node instanceof TextNode;
    if (isText) {
        // TextNodes carry all user-readable text in the DOM.
        final TextNode textNode = (TextNode) node;
        append(textNode.text());
    } else if (tagName.equals("li")) {
        append("\n * ");
    }
}

Example 82

Project: jenkinsmobi-api-master File: GoogleSsoHandler.java View source code

/**
 * Returns the trimmed whole text of the first direct {@link TextNode} child of {@code errorDiv}.
 *
 * @param errorDiv element whose child nodes are scanned
 * @return the trimmed text of the first text child, or an empty string if there is none
 */
private String getDivText(final Element errorDiv) {
    String divText = "";
    for (final Node candidate : errorDiv.childNodes()) {
        if (candidate instanceof TextNode) {
            final TextNode textNode = (TextNode) candidate;
            divText = textNode.getWholeText().trim();
            break;
        }
    }
    return divText;
}

Example 83

Project: tika-wrapper-master File: HtmlToPlaintTextSimple.java View source code

// Runs when the node is first seen; emits text for text nodes and a list bullet for "li".
public void head(Node node, int depth) {
    final String elementName = node.nodeName();
    if (!(node instanceof TextNode)) {
        if (elementName.equals("li")) {
            append("\n * ");
        }
        return;
    }
    // TextNodes carry all user-readable text in the DOM.
    append(((TextNode) node).text());
}

Example 84

Project: validadorAcessibilidade-master File: HtmlToPlainText.java View source code

// Hit when the node is first seen: text nodes are appended as text,
// "li" nodes start a new bullet line, all other nodes are skipped.
public void head(Node node, int depth) {
    final String currentName = node.nodeName();
    if (node instanceof TextNode) {
        // TextNodes carry all user-readable text in the DOM.
        final String nodeText = ((TextNode) node).text();
        append(nodeText);
    } else {
        if (currentName.equals("li")) {
            append("\n * ");
        }
    }
}

Example 85

Project: zafu_jwc-master File: HtmlToPlainText.java View source code

// First visit of a node: append its text if it is a text node,
// or a bullet marker if it is named "li".
public void head(Node node, int depth) {
    final String visitedName = node.nodeName();
    if (node instanceof TextNode) {
        // TextNodes carry all user-readable text in the DOM.
        final TextNode visitedText = (TextNode) node;
        append(visitedText.text());
        return;
    }
    final boolean isListItem = visitedName.equals("li");
    if (isListItem) {
        append("\n * ");
    }
}

Example 86

Project: act-master File: PatentDocument.java View source code

/**
 * Appends the text of every non-empty text node to {@code segmentBuilder};
 * all other node types are ignored.
 *
 * @param node the node being visited
 * @param i unused here
 */
@Override
public void head(org.jsoup.nodes.Node node, int i) {
    // This borrows a page from HtmlToPlainText's book.
    if (node instanceof TextNode) {
        final String text = ((TextNode) node).text();
        if (text != null && !text.isEmpty()) {
            // reuse the value fetched above instead of calling text() a second time
            segmentBuilder.append(text);
        }
    }
}