Java Examples for org.htmlcleaner.TagNode

The following Java examples illustrate how org.htmlcleaner.TagNode is used. The code samples are taken from various open-source projects.

Example 1

Project: pair-java-master File: HtmlParserUtil.java View source code

/**
 * Cleans the given HTML with HtmlCleaner and converts the result into a W3C DOM document.
 *
 * @param htmlContent the HTML markup to parse
 * @return the DOM document, or {@code null} if cleaning or DOM construction fails with a
 *         {@link RuntimeException} or a {@link ParserConfigurationException}
 */
public static Document getHtmlDocumentModel(String htmlContent) {
    try {
        TagNode cleanedRoot = new HtmlCleaner().clean(htmlContent);
        DomSerializer domSerializer = new DomSerializer(new CleanerProperties());
        return domSerializer.createDOM(cleanedRoot);
    } catch (ParserConfigurationException e) {
        // Callers only ever see null for a parser configuration problem.
        return null;
    } catch (RuntimeException e) {
        return null;
    }
}

Example 2

Project: dungproxy-master File: XmlModeFetcher.java View source code

/**
 * Cleans the HTML and collects strings from it through {@code innerFetch}.
 *
 * @param html the HTML markup to process
 * @return the list filled by {@code innerFetch}, or {@code null} if HtmlCleaner yields no root node
 */
public List<String> fetch(String html) {
    TagNode cleanedRoot = new HtmlCleaner().clean(html);
    if (cleanedRoot == null) {
        return null;
    }
    List<String> collected = new ArrayList<String>();
    // The same NodeData instance is passed for both NodeData arguments.
    NodeData pageNode = new NodeData(null, "page");
    // NOTE(review): `doc` is not declared in this method; presumably a field of the class - confirm.
    innerFetch(collected, doc.getRootElement(), pageNode, pageNode, cleanedRoot, null);
    return collected;
}

Example 3

Project: FastHub-master File: TableHandler.java View source code

/**
 * Recursively walks a node and feeds its table structure into {@code table}.
 * Objects that are not {@link TagNode}s are ignored.
 *
 * @param node  the node to inspect
 * @param table the table receiving rows and cells
 */
private void readNode(Object node, Table table) {
    if (!(node instanceof TagNode)) {
        return;
    }
    TagNode element = (TagNode) node;
    String tag = element.getName();
    if (tag.equals("td") || tag.equals("th")) {
        // A cell is converted as a whole; its children are not visited separately.
        Spanned cellContent = this.getSpanner().fromTagNode(element);
        table.addCell(cellContent);
        return;
    }
    if (tag.equals("tr")) {
        table.addRow();
    }
    for (Object child : element.getChildTags()) {
        readNode(child, table);
    }
}

Example 4

Project: HtmlSpanner-master File: CSSCompiler.java View source code

/**
 * Tests whether the node carries this matcher's class and, if a tag name is configured,
 * that tag name as well.
 *
 * @param tagNode the node to test; {@code null} never matches
 * @return {@code true} if the optional tag name and the {@code class} attribute both match
 */
@Override
public boolean matches(TagNode tagNode) {
    if (tagNode == null) {
        return false;
    }
    // An empty or missing tag name means "any tag".
    boolean tagNameRequired = tagName != null && tagName.length() > 0;
    if (tagNameRequired && !tagName.equals(tagNode.getName())) {
        return false;
    }
    String classAttribute = tagNode.getAttributeByName("class");
    if (classAttribute == null) {
        return false;
    }
    return classAttribute.equals(className);
}

Example 5

Project: PageTurner-master File: BookView.java View source code

@TargetApi(Build.VERSION_CODES.FROYO)
@Override
public void handleTagNode(TagNode node, SpannableStringBuilder builder, int start, int end, SpanStack span) {
    String src = node.getAttributeByName("src");
    if (src == null) {
        src = node.getAttributeByName("href");
    }
    if (src == null) {
        src = node.getAttributeByName("xlink:href");
    }
    if (src == null) {
        return;
    }
    builder.append("");
    if (src.startsWith("data:image")) {
        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.FROYO) {
            try {
                String dataString = src.substring(src.indexOf(',') + 1);
                byte[] binData = Base64.decode(dataString, Base64.DEFAULT);
                setImageSpan(builder, new BitmapDrawable(getContext().getResources(), BitmapFactory.decodeByteArray(binData, 0, binData.length)), start, builder.length());
            } catch (OutOfMemoryErrorIllegalArgumentException |  ia) {
            }
        }
    } else if (spine != null) {
        String resolvedHref = spine.resolveHref(src);
        if (textLoader.hasCachedImage(resolvedHref) && !fakeImages) {
            Drawable drawable = textLoader.getCachedImage(resolvedHref);
            setImageSpan(builder, drawable, start, builder.length());
            LOG.debug("Got cached href: " + resolvedHref);
        } else {
            LOG.debug("Loading href: " + resolvedHref);
            this.registerCallback(resolvedHref, new ImageCallback(resolvedHref, builder, start, builder.length(), fakeImages));
        }
    }
}

Example 6

Project: confluence2wordpress-master File: DefaultConverter.java View source code

/**
 * Converts a page to HTML: pre-processes the storage format, renders it, cleans the
 * result, runs the tag visitors over the body and post-processes the serialized HTML.
 *
 * The page title is temporarily replaced by the title from {@code options} and is
 * always restored, even on failure.
 *
 * @param page    the page to convert
 * @param options conversion settings
 * @return the converted HTML
 * @throws ConversionException declared by this method; not thrown directly in this body
 */
public String convert(ContentEntityObject page, ConverterOptions options) throws ConversionException {
    final String titleBackup = page.getTitle();
    try {
        // Swap in the target title so that generated anchors are correct.
        page.setTitle(options.getPageTitle());
        String markup = page.getBodyAsString();
        PageContext pageContext = page.toPageContext();
        DefaultConversionContext conversionContext = new DefaultConversionContext(pageContext);

        // Storage pre-processing.
        for (PreProcessor step : getPreProcessors(options, conversionContext)) {
            markup = step.preProcess(markup, options, pageContext);
        }

        // Storage -> HTML rendering.
        String rendered = renderer.render(markup, conversionContext);
        handleConversionErrors(rendered);

        // HTML cleanup; only the <body> subtree is processed further.
        HtmlCleaner cleaner = getHtmlCleaner(options);
        TagNode body = cleaner.clean(rendered).findElementByName("body", false);

        // DOM traversal.
        for (TagNodeVisitor visitor : getTagNodeVisitors(options, page)) {
            body.traverse(visitor);
        }

        // Serialization followed by HTML post-processing.
        String html = serialize(body, cleaner.getProperties(), options);
        for (PostProcessor step : getPostProcessors(options)) {
            html = step.postProcess(html, body, options);
        }
        return html;
    } finally {
        page.setTitle(titleBackup);
    }
}

Example 7

Project: en-webmagic-master File: Xpath2Selector.java View source code

/**
 * Evaluates the configured XPath expression against the given HTML and returns the first match.
 *
 * The expression is first evaluated as a node set; if that throws an
 * {@link XPathExpressionException} it is evaluated as a string instead. Attribute and text
 * nodes yield their text content; other nodes are serialized without an XML declaration.
 *
 * @param text the HTML markup to query
 * @return the first match, or {@code null} if nothing matches or any exception occurs
 */
@Override
public String select(String text) {
    try {
        TagNode cleaned = new HtmlCleaner().clean(text);
        Document dom = new DomSerializer(new CleanerProperties()).createDOM(cleaned);

        Object evaluated;
        try {
            evaluated = xPathExpression.evaluate(dom, XPathConstants.NODESET);
        } catch (XPathExpressionException e) {
            evaluated = xPathExpression.evaluate(dom, XPathConstants.STRING);
        }

        if (!(evaluated instanceof NodeList)) {
            return evaluated.toString();
        }
        NodeList matches = (NodeList) evaluated;
        if (matches.getLength() == 0) {
            return null;
        }
        Node first = matches.item(0);
        short nodeType = first.getNodeType();
        if (nodeType == Node.ATTRIBUTE_NODE || nodeType == Node.TEXT_NODE) {
            return first.getTextContent();
        }

        // Any other node type is serialized as markup.
        StringWriter buffer = new StringWriter();
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        transformer.transform(new DOMSource(first), new StreamResult(buffer));
        return buffer.toString();
    } catch (Exception e) {
        logger.error("select text error! " + xpathStr, e);
    }
    return null;
}

Example 8

Project: epublib-master File: HHCParser.java View source code

/**
 * Parses an HHC table-of-contents stream into TOC references.
 *
 * The stream is cleaned with HtmlCleaner and converted to a DOM; the node selected by
 * {@code body/ul} relative to the document element is handed to {@code processUlNode}.
 *
 * @param hhcFile   the HHC content to read
 * @param resources passed through to {@code processUlNode}
 * @return whatever {@code processUlNode} returns for the selected list node
 * @throws IOException                  if reading the stream fails
 * @throws ParserConfigurationException if the DOM cannot be created
 * @throws XPathExpressionException     if the XPath evaluation fails
 */
public static List<TOCReference> parseHhc(InputStream hhcFile, Resources resources) throws IOException, ParserConfigurationException, XPathExpressionException {
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode cleanedRoot = cleaner.clean(hhcFile);
    Document hhcDocument = new DomSerializer(cleaner.getProperties()).createDOM(cleanedRoot);
    XPath xpath = XPathFactory.newInstance().newXPath();
    Object topLevelList = xpath.evaluate("body/ul", hhcDocument.getDocumentElement(), XPathConstants.NODE);
    return processUlNode((Node) topLevelList, resources);
}

Example 9

Project: jeboorker-master File: HTMLMetadataReader.java View source code

/**
 * Extracts metadata from the {@code <meta>} and {@code <title>} elements of the given HTML.
 *
 * For a meta element without a {@code name} attribute, the name falls back to the last
 * non-null attribute value that differs from the {@code content} value.
 *
 * @param content the HTML content containing the metadata
 * @return the extracted metadata properties; never {@code null}
 * @throws IOException if the content cannot be read by the cleaner
 */
private List<MetadataProperty> extractMetadata(final String content) throws IOException {
    final List<MetadataProperty> result = new ArrayList<>();
    final TagNode rootNode = new HtmlCleaner().clean(new StringReader(content));

    // Meta tags.
    for (TagNode meta : rootNode.getElementsByName("meta", true)) {
        String metaName = meta.getAttributeByName("name");
        String metaContent = meta.getAttributeByName("content");
        if (metaName == null) {
            Map<String, String> attributes = meta.getAttributes();
            for (String value : attributes.values()) {
                if (value != null && !value.equals(metaContent)) {
                    metaName = value;
                }
            }
        }
        result.add(new MetadataProperty(metaName, metaContent));
    }

    // Title tags.
    for (TagNode title : rootNode.getElementsByName("title", true)) {
        StringBuffer text = title.getText();
        result.add(new MetadataProperty(COMMON_METADATA_TYPES.TITLE.getName(), text));
    }
    return result;
}

Example 10

Project: Reforger-master File: Item.java View source code

/**
 * Parses this item once: reads the name, item ID, data-item attributes and gem IDs from
 * {@code _data}, builds a Wowhead URL from them, fetches that URL and traverses the
 * cleaned tooltip markup with {@code this} as the visitor.
 *
 * The method is synchronized and does nothing once {@code _parsed} has been set.
 */
public synchronized void parse() {
    if (!_parsed) {
        String[] pair;
        String[] elements;
        String attribute;
        URL url = null;
        StringBuilder wowhead = new StringBuilder("http://www.wowhead.com/");
        TagNode ref = null;
        //<editor-fold defaultstate="collapsed" desc="Parse name.">
        ref = _data.findElementByAttValue("class", "name-shadow", true, true);
        assert ref != null && ref.getText() != null : "Error: unable to determine item name.";
        // Falls back to an empty name when the element is missing and assertions are disabled.
        _name = (ref != null) ? StringEscapeUtils.unescapeHtml4(ref.getText().toString()) : "";
        //</editor-fold>
        //<editor-fold defaultstate="collapsed" desc="Parse item ID.">
        ref = _data.findElementByName("a", false);
        assert ref != null : "Error: unable to determine item attributes.";
        attribute = ref.getAttributeByName("href");
        // The item ID is whatever follows "/wow/en/item/" in the link.
        elements = attribute.split("/wow/en/item/");
        assert elements.length == 2 : "Error: unexpected Armory data format.";
        wowhead.append("item=").append(elements[1]);
        //</editor-fold>
        //<editor-fold defaultstate="collapsed" desc="Extract data-item string.">
        ref = _data.findElementByName("a", false);
        assert ref != null : "Error: unable to determine item attributes.";
        attribute = ref.getAttributeByName("data-item");
        // A missing data-item attribute is treated as an empty string.
        elements = StringEscapeUtils.unescapeHtml4((attribute != null) ? attribute : "").split("&");
        //<editor-fold defaultstate="collapsed" desc="Parse Armory data-item attributes.">
        // NOTE(review): pair[1] is read without a length check, so a recognised key
        // without "=value" would throw ArrayIndexOutOfBoundsException - confirm the input format.
        for (String e : elements) {
            pair = e.split("=");
            if ("e".equals(pair[0])) {
                // Permanent Enchantment
                wowhead.append("&ench=").append(pair[1]);
            }
            if ("re".equals(pair[0])) {
                // Reforge ID (not currently supported by Wowhead)
                wowhead.append("&rf=").append(pair[1]);
            }
            if ("es".equals(pair[0])) {
                // Additional Socket
                wowhead.append("&sock");
            }
            if ("r".equals(pair[0])) {
                // Random Itemization
                wowhead.append("&rand=").append(pair[1]);
            }
            if ("set".equals(pair[0])) {
                // Set Pieces Equipped
                wowhead.append("&pcs=").append(pair[1].replace(',', ':'));
            }
        }
        //</editor-fold>
        //<editor-fold defaultstate="collapsed" desc="Parse Armory gem ID's.">
        TagNode[] gems = _data.getElementsByAttValue("class", "gem", true, true);
        final int GEM_COUNT = gems.length;
        if (GEM_COUNT != 0) {
            String suffix;
            wowhead.append("&gems=");
            // Gem IDs are appended as a colon-separated list.
            for (int i = 0; i < GEM_COUNT; ++i) {
                suffix = gems[i].getAttributeByName("href").replace("/wow/en/item/", "");
                wowhead.append(suffix);
                if (i + 1 < GEM_COUNT) {
                    wowhead.append(":");
                }
            }
        }
        //</editor-fold>
        wowhead.append("&power");
        System.out.println("  " + _name);
        //<editor-fold defaultstate="collapsed" desc="Download and parse Wowhead JSON data.">
        try {
            url = new URL(wowhead.toString());
        } catch (Exception e) {
            // The failure is only logged; url stays null and is still passed on below.
            Logger.getLogger(Item.class.getSimpleName()).log(Level.SEVERE, null, e);
        }
        Pattern p = Pattern.compile(TOOLTIP_FORMAT);
        Matcher m = p.matcher(URLRetriever.fetchContents(url));
        // Group 1 of the first match is the payload; no match yields an empty payload.
        String itemPayload = (m.find()) ? m.group(1) : "";
        HtmlCleaner parser = new HtmlCleaner();
        CleanerTransformations transform = new CleanerTransformations();
        // Presumably drops <small> tags from the cleaned output - confirm TagTransformation semantics.
        TagTransformation strip = new TagTransformation("small");
        transform.addTransformation(strip);
        parser.setTransformations(transform);
        TagNode root = parser.clean(itemPayload);
        //</editor-fold>
        root.traverse(this);
        System.out.println();
        _parsed = true;
    }
}

Example 11

Project: WebGatherer---Scraper-and-Analyzer-master File: HtmlParserImpl.java View source code

/**
 * Collects the links of all {@code <a>} elements that have an {@code href} attribute.
 *
 * @param baseUrl  base URL handed to {@code getRelativeLink} together with each href
 * @param htmlPage the HTML markup to scan
 * @return a map from lower-cased, trimmed link text to the link produced by
 *         {@code getRelativeLink}; later links with the same text overwrite earlier ones
 */
public Map<String, String> extractLinks(String baseUrl, String htmlPage) {
    TagNode root = htmlCleaner.clean(htmlPage);
    TagNode[] anchors = root.getElementsByName("a", true);
    Map<String, String> linksByText = new HashMap<String, String>();
    for (TagNode anchor : anchors) {
        if (!anchor.getAttributes().containsKey("href")) {
            continue;
        }
        String href = anchor.getAttributeByName("href").trim();
        String link = getRelativeLink(href, baseUrl);
        String label = anchor.getText().toString().toLowerCase().trim();
        linksByText.put(label, link);
    }
    return linksByText;
}

Example 12

Project: agile-stock-master File: EtnetQuoteFetcher.java View source code

/**
 * Downloads the quote page and scrapes update time, price, price change, day high/low
 * and volume into a {@link StockDetail}.
 *
 * @param quote the stock quote identifier
 * @return the populated stock detail
 * @throws DownloadException if the HTTP request or reading the response fails
 * @throws ParseException    if an XPath lookup fails or the update date cannot be parsed
 */
@Override
public StockDetail fetch(String quote) {
    StockDetail detail = new StockDetail();
    String url = getUrl(quote);
    HttpGet request = new HttpGet(url);
    try {
        detail.setQuote(quote);
        detail.setSourceUrl(url);

        // Download the page and keep only the slice between the two markers to reduce its size.
        String page = EntityUtils.toString(getClient().execute(request).getEntity());
        int sliceStart = page.indexOf("<!-- Content -->");
        int sliceEnd = page.indexOf("top:-1000px;\">");
        String slice = StringUtils.substring(page, sliceStart, sliceEnd);
        TagNode document = getCleaner().clean(slice);

        // Update timestamp.
        SimpleDateFormat formatter = new SimpleDateFormat(DATE_FORMAT);
        Date updatedDate = formatter.parse(getFirstMatchedElementContent(document, XPATH_UPDATE));
        Calendar updatedAt = Calendar.getInstance();
        updatedAt.setTime(updatedDate);
        detail.setUpdatedAt(updatedAt);

        TagNode table = getFirstMatchedElement(document, XPATH_BASE);

        // Price.
        detail.setPrice(new BigDecimal(getFirstMatchedElementContent(table, XPATH_PRICE)));

        // Price change: group 1 is the absolute change, group 2 the percentage.
        Matcher change = PATTERN_PRICE_CHANGE.matcher(getFirstMatchedElementContent(table, XPATH_PRICE_CHANGE));
        if (change.find()) {
            detail.setChangePrice(new BigDecimal(change.group(1)));
            detail.setChangePricePercent(new BigDecimal(change.group(2)));
        }

        detail.setDayHigh(new BigDecimal(getFirstMatchedElementContent(table, XPATH_DAY_HIGH)));
        detail.setDayLow(new BigDecimal(getFirstMatchedElementContent(table, XPATH_DAY_LOW)));
        detail.setVolume(getFirstMatchedElementContent(table, XPATH_DAY_VOLUME));
    } catch (IOException e) {
        // Also covers ClientProtocolException, which is an IOException.
        throw new DownloadException("error fetching stock", e);
    } catch (XPatherException e) {
        throw new ParseException("unexpected result while fetch stock", e);
    } catch (java.text.ParseException e) {
        throw new ParseException("date format unparsable", e);
    }
    return detail;
}

Example 13

Project: brezskrbnik-master File: Service2.java View source code

/**
 * Downloads and cleans the document at the given URL.
 *
 * Special entities are translated, reserved characters are converted to numeric
 * character references and comments are omitted.
 *
 * @param url the address of the document to clean
 * @return the root node, or {@code null} if the URL is malformed or the download fails
 *         (the stack trace is printed in both cases)
 */
public TagNode xmlCleaner(String url) {
    CleanerProperties settings = new CleanerProperties();
    settings.setTranslateSpecialEntities(true);
    settings.setTransResCharsToNCR(true);
    settings.setOmitComments(true);
    try {
        return new HtmlCleaner(settings).clean(new URL(url));
    } catch (IOException e) {
        // MalformedURLException is an IOException, so this covers both failure modes.
        e.printStackTrace();
        return null;
    }
}

Example 14

Project: MadStore-master File: PreprocessingStage.java View source code

/**
 * Cleans the page data with HtmlCleaner and returns a new page holding the compact XML form.
 *
 * @param page the page to clean
 * @return a new page with the same link and the cleaned content, or {@code null} if any
 *         exception occurs (it is logged at warn level)
 */
public Page execute(Page page) {
    try {
        LOG.info("Cleaning up page: {}", page.getLink());

        HtmlCleaner cleaner = new HtmlCleaner();
        CleanerProperties settings = cleaner.getProperties();
        settings.setOmitComments(true);
        settings.setTranslateSpecialEntities(false);
        settings.setRecognizeUnicodeChars(false);
        settings.setOmitUnknownTags(true);
        settings.setOmitDoctypeDeclaration(false);
        settings.setOmitXmlDeclaration(false);
        settings.setUseCdataForScriptAndStyle(true);

        TagNode root = cleaner.clean(page.getData());
        root.removeAttribute("xmlns:xml");

        XmlSerializer compactSerializer = new CompactXmlSerializer(settings);
        String cleanedPage = compactSerializer.getXmlAsString(root, "UTF-8");
        LOG.debug("Cleaned page: {}", cleanedPage);
        return new Page(page.getLink(), cleanedPage);
    } catch (Exception e) {
        LOG.warn(e.getMessage(), e);
        return null;
    }
}

Example 15

Project: RestFixtureLiveDoc-master File: HtmlSimplifier.java View source code

/**
 * Serializes the node and collapses repeated line breaks in the resulting markup.
 *
 * @param tagNode the node to serialize
 * @return the serialized markup with runs of {@code <br />} and blank lines reduced
 * @throws RuntimeException wrapping any {@link IOException} raised by the serializer
 */
private String serializeAndSanitiseResult(TagNode tagNode) {
    final String serialized;
    try {
        serialized = serializer.getAsString(tagNode, true);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    // NOTE(review): as written this replaces a space with a space; the pattern was presumably
    // a non-breaking space or an &nbsp; entity before extraction - confirm against upstream.
    String cleaned = serialized.replaceAll(" ", " ");
    // Two passes collapse doubled "<br />" + blank-line pairs.
    for (int pass = 0; pass < 2; pass++) {
        cleaned = cleaned.replace("<br />\n\n<br />\n\n", "<br />\n\n");
    }
    cleaned = cleaned.replace("<br />\n\n", "<br />\n");
    cleaned = cleaned.replace("<br />\n<br />\n", "<br />\n");
    return cleaned;
}

Example 16

Project: EasySOA-Incubation-master File: ScrapingStrategy.java View source code

/**
 * Scrapes the page held by the browsing context for links ending in {@code ?wsdl} and
 * reports each one as a found service.
 *
 * Links are processed on a best-effort basis: a link that cannot be processed is logged
 * and skipped so that the remaining links are still examined.
 *
 * @param context the browsing context providing the page data and its URL
 * @return the services found; empty if the context has no data or HtmlCleaner overflows the stack
 * @throws Exception if the XPath evaluation or the link rewriting fails
 */
@Override
public List<FoundService> findFromContext(BrowsingContext context) throws Exception {
    List<FoundService> foundServices = new LinkedList<FoundService>();
    if (context.getData() != null) {
        URL url = context.getURL();
        // Web page parsing
        HtmlCleaner cleaner = new HtmlCleaner();
        TagNode cleanHtml = null;
        try {
            cleanHtml = cleaner.clean(context.getData());
        } catch (StackOverflowError e) {
            log.warn("HtmlCleaner stack overflow while parsing " + url + ", aborting strategy");
            return foundServices;
        }
        // Find app name
        String applicationName = guessApplicationName(context);
        // Find links
        List<String> foundServicesNames = new LinkedList<String>();
        Object[] links = cleanHtml.evaluateXPath("//a");
        changeToAbsolutePath(links, "href", url);
        for (Object o : links) {
            TagNode link = (TagNode) o;
            try {
                String linkHref = link.getAttributeByName("href");
                if (linkHref == null) {
                    // NB. happens in some bad html
                    continue;
                }
                String ref = new URL(url, linkHref).toString();
                // TODO fall back to the title attribute before using the link itself
                String name = (link.getText() != null) ? link.getText().toString() : ref;
                // Truncate if name is an URL (serviceName cannot contain slashes)
                if (name.contains("/")) {
                    // NOTE(review): this splits on the literal "/}" rather than on "/", so a
                    // name with slashes is normally left whole - confirm the intended pattern.
                    String[] nameParts = name.split("/}");
                    // Strips "?", "." and "?wsdl", but not a bare "wsdl".
                    name = nameParts[nameParts.length - 1].replaceAll("(\\?|\\.|\\?wsdl)", "");
                }
                // Append digits to the link name if it already exists
                int i = 1;
                if (ref != null && ref.toLowerCase().endsWith("?wsdl")) {
                    while (foundServicesNames.contains(name)) {
                        name = (i == 1 ? name + i++ : name.substring(0, name.length() - 1)) + i++;
                    }
                    name = name.replaceAll("[\n\r]", "").trim();
                    String nameWithoutWsdl = name.replaceAll("([ ]*\\?WSDL|[ ]*\\?wsdl)", "").trim();
                    // Keep the original name if stripping the suffix would leave nothing.
                    if (!nameWithoutWsdl.isEmpty()) {
                        name = nameWithoutWsdl;
                    }
                    foundServices.add(new FoundService(name, ref, applicationName));
                    foundServicesNames.add(name);
                }
            } catch (Exception e) {
                // Best effort: one bad link must not abort the scan, but it should not vanish silently.
                log.warn("Skipping link that could not be processed on " + url + ": " + e);
            }
        }
    }
    return foundServices;
}

Example 17

Project: EasySOA-master File: ScrapingStrategy.java View source code

/**
 * Scrapes the page held by the browsing context for links ending in {@code wsdl} and
 * reports each one as a found service.
 *
 * Links without an {@code href} are skipped, and links whose URL is malformed are logged
 * and skipped, so that the remaining links are still examined.
 *
 * @param context the browsing context providing the page data and its URL
 * @return the services found; empty if the context has no data or HtmlCleaner overflows the stack
 * @throws Exception if the XPath evaluation or the link rewriting fails
 */
@Override
public List<FoundService> findFromContext(BrowsingContext context) throws Exception {
    List<FoundService> foundServices = new LinkedList<FoundService>();
    if (context.getData() != null) {
        URL url = context.getURL();
        // Web page parsing
        HtmlCleaner cleaner = new HtmlCleaner();
        TagNode cleanHtml = null;
        try {
            cleanHtml = cleaner.clean(context.getData());
        } catch (StackOverflowError e) {
            log.warn("HtmlCleaner stack overflow while parsing " + url + ", aborting strategy");
            return foundServices;
        }
        // Find app name
        String applicationName = guessApplicationName(url);
        // Find links
        List<String> foundServicesNames = new LinkedList<String>();
        Object[] links = cleanHtml.evaluateXPath("//a");
        changeToAbsolutePath(links, "href", url);
        for (Object o : links) {
            TagNode link = (TagNode) o;
            try {
                String linkHref = link.getAttributeByName("href");
                if (linkHref == null) {
                    // Anchors without href occur in bad HTML; skip them explicitly rather
                    // than relying on the URL constructor to reject a null spec.
                    continue;
                }
                String ref = new URL(url, linkHref).toString();
                String name = (link.getText() != null) ? link.getText().toString() : ref;
                // Truncate if name is an URL (serviceName cannot contain slashes)
                if (name.contains("/")) {
                    // NOTE(review): this splits on the literal "/}" rather than on "/", so a
                    // name with slashes is normally left whole - confirm the intended pattern.
                    String[] nameParts = name.split("/}");
                    name = nameParts[nameParts.length - 1].replaceAll("(\\?|\\.|wsdl)", "");
                }
                // Append digits to the link name if it already exists
                int i = 1;
                if (ref != null && ref.toLowerCase().endsWith("wsdl")) {
                    while (foundServicesNames.contains(name)) {
                        name = (i == 1 ? name + i++ : name.substring(0, name.length() - 1)) + i++;
                    }
                    name = name.replaceAll("([\n\r]|[ ]*WSDL|[ ]*wsdl)", "").trim();
                    foundServices.add(new FoundService(name, ref, applicationName));
                    foundServicesNames.add(name);
                }
            } catch (MalformedURLException e) {
                // Best effort: one bad link must not abort the scan, but it should not vanish silently.
                log.warn("Skipping link with malformed URL on " + url + ": " + e);
            }
        }
    }
    return foundServices;
}

Example 18

Project: email-master File: HtmlSignatureRemover.java View source code

/**
 * Removes a dash-delimited signature from HTML content and re-serializes the markup.
 *
 * Only a signature outside {@code <blockquote>} sections is stripped: before the first
 * blockquote, in a gap between blockquotes, or after the last one. Nothing is stripped
 * when the numbers of opening and closing blockquote tags differ. The content is always
 * run through HtmlCleaner afterwards, which repairs tags cut off by the truncation.
 *
 * @param content the HTML content to process
 * @return the cleaned HTML, without the signature where one was found
 */
public static String stripSignature(String content) {
    Matcher signature = DASH_SIGNATURE_HTML.matcher(content);
    if (signature.find()) {
        Matcher opening = BLOCKQUOTE_START.matcher(content);
        Matcher closing = BLOCKQUOTE_END.matcher(content);
        List<Integer> openOffsets = new ArrayList<>();
        List<Integer> closeOffsets = new ArrayList<>();
        while (opening.find()) {
            openOffsets.add(opening.start());
        }
        while (closing.find()) {
            closeOffsets.add(closing.start());
        }

        if (openOffsets.size() != closeOffsets.size()) {
            Timber.d("There are %d <blockquote> tags, but %d </blockquote> tags. Refusing to strip.", openOffsets.size(), closeOffsets.size());
        } else if (openOffsets.isEmpty()) {
            // No blockquotes: cut at the signature found by the initial search.
            content = content.substring(0, signature.start());
        } else {
            // Signatures inside blockquotes are quoted text and must be ignored.
            signature.region(0, openOffsets.get(0));
            if (signature.find()) {
                // Signature before the first <blockquote>.
                content = content.substring(0, signature.start());
            } else {
                // Gaps between one </blockquote> and the next <blockquote>.
                for (int i = 0; i + 1 < openOffsets.size(); i++) {
                    int gapStart = closeOffsets.get(i);
                    int gapEnd = openOffsets.get(i + 1);
                    if (gapStart >= gapEnd) {
                        continue;
                    }
                    signature.region(gapStart, gapEnd);
                    if (signature.find()) {
                        content = content.substring(0, signature.start());
                        break;
                    }
                }
                // Text after the last </blockquote>.
                int lastClose = closeOffsets.get(closeOffsets.size() - 1);
                if (lastClose < content.length()) {
                    signature.region(lastClose, content.length());
                    if (signature.find()) {
                        content = content.substring(0, signature.start());
                    }
                }
            }
        }
    }

    // Repair closing tags lost by the truncation and tidy the quoted message's HTML.
    // See http://htmlcleaner.sourceforge.net/parameters.php for the property descriptions.
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties properties = cleaner.getProperties();
    properties.setNamespacesAware(false);
    properties.setAdvancedXmlEscape(false);
    properties.setOmitXmlDeclaration(true);
    properties.setOmitDoctypeDeclaration(false);
    properties.setTranslateSpecialEntities(false);
    properties.setRecognizeUnicodeChars(false);
    TagNode cleanedRoot = cleaner.clean(content);
    return new SimpleHtmlSerializer(properties).getAsString(cleanedRoot, "UTF8");
}

Example 19

Project: FivewaysBusTimesAndroid-master File: BusTimeScraper.java View source code

/**
 * Downloads a bus-times page and converts it into a list of buses.
 *
 * The nodes selected by {@code BUS_EXPR} are read in groups of three (name, destination,
 * time). Groups with an empty bus name are skipped, and an incomplete trailing group is
 * ignored. A time is either a clock time ({@code hh:mm}), which is moved to the next day
 * if it lies before now, or a minutes offset from now.
 *
 * @param url the address of the bus-times page
 * @return the buses found on the page, in page order
 * @throws XPatherException             if the XPath evaluation fails
 * @throws ParserConfigurationException declared for callers; not thrown directly in this body
 * @throws SAXException                 declared for callers; not thrown directly in this body
 * @throws IOException                  if the page cannot be downloaded or read
 */
public static List<Bus> getBusesFromURL(String url) throws XPatherException, ParserConfigurationException, SAXException, IOException {
    HtmlCleaner cleaner = new HtmlCleaner();
    URLConnection conn = new URL(url).openConnection();
    TagNode node;
    // NOTE(review): the reader uses the platform default charset, as the original did.
    InputStreamReader reader = new InputStreamReader(conn.getInputStream());
    try {
        node = cleaner.clean(reader);
    } finally {
        // Always release the connection's stream.
        reader.close();
    }
    Object[] dataNodes = node.evaluateXPath(BUS_EXPR);
    List<Bus> busList = new ArrayList<Bus>();
    // Require a complete triple so that a truncated page cannot overrun the array.
    for (int i = 0; i + 2 < dataNodes.length; i += 3) {
        String busName = ((TagNode) dataNodes[i]).getText().toString();
        String busDest = ((TagNode) dataNodes[i + 1]).getText().toString();
        String busTime = ((TagNode) dataNodes[i + 2]).getText().toString();
        // Compare by content; a reference comparison against "" never detects an empty name.
        if (busName.length() > 0) {
            // NOTE(review): as written these replace a space with a space; the target was
            // presumably a non-breaking space before extraction - confirm against upstream.
            busDest = busDest.replace(" ", " ");
            busTime = busTime.replace(" ", " ");
            // The time may carry an appended * for a timetabled time.
            Calendar arriveTime = Calendar.getInstance();
            if (!busTime.contains(":")) {
                // in the format 'mm mins' or 'm mins'
                int minutesOffset = Integer.parseInt(busTime.substring(0, 2).trim());
                arriveTime.add(Calendar.MINUTE, minutesOffset);
            } else {
                int colon = busTime.indexOf(':');
                int hour = Integer.parseInt(busTime.substring(0, colon));
                int minutes = Integer.parseInt(busTime.substring(colon + 1, colon + 3));
                Log.v(LOG_TAG, hour + " == " + minutes);
                // Snapshot "now" before mutating; an alias would make the check below always false.
                Calendar timeNow = (Calendar) arriveTime.clone();
                arriveTime.set(Calendar.HOUR_OF_DAY, hour);
                arriveTime.set(Calendar.MINUTE, minutes);
                // deal with midnight crossing
                if (arriveTime.before(timeNow)) {
                    arriveTime.add(Calendar.HOUR_OF_DAY, 24);
                }
            }
            Bus bus = new Bus(busName, busDest, arriveTime);
            busList.add(bus);
            Log.v(LOG_TAG, bus.toString());
        }
    }
    return busList;
}

Example 20

Project: k-9-master File: HtmlSignatureRemover.java View source code

/**
 * Strips a signature from an HTML message body and re-serializes the result with HtmlCleaner.
 *
 * <p>The signature is located with {@code DASH_SIGNATURE_HTML}; when one is found outside of
 * {@code <blockquote>} sections, {@code content} is truncated at the start of that match.
 * Regardless of whether anything was stripped, the (possibly truncated) content is run through
 * HtmlCleaner and serialized back to a string.
 *
 * @param content the HTML text to process
 * @return the cleaned HTML, with the signature removed when one could be stripped
 */
public static String stripSignature(String content) {
    Matcher dashSignatureHtml = DASH_SIGNATURE_HTML.matcher(content);
    if (dashSignatureHtml.find()) {
        Matcher blockquoteStart = BLOCKQUOTE_START.matcher(content);
        Matcher blockquoteEnd = BLOCKQUOTE_END.matcher(content);
        // Start offsets of every opening / closing blockquote match, in document order.
        List<Integer> start = new ArrayList<>();
        List<Integer> end = new ArrayList<>();
        while (blockquoteStart.find()) {
            start.add(blockquoteStart.start());
        }
        while (blockquoteEnd.find()) {
            end.add(blockquoteEnd.start());
        }
        if (start.size() != end.size()) {
            // Unbalanced tags: only log, leave the content untouched.
            Timber.d("There are %d <blockquote> tags, but %d </blockquote> tags. Refusing to strip.", start.size(), end.size());
        } else if (start.size() > 0) {
            // Ignore quoted signatures in blockquotes.
            dashSignatureHtml.region(0, start.get(0));
            if (dashSignatureHtml.find()) {
                // before first <blockquote>.
                content = content.substring(0, dashSignatureHtml.start());
            } else {
                // Search each gap between a closing match and the next opening match.
                for (int i = 0; i < start.size() - 1; i++) {
                    // within blockquotes.
                    if (end.get(i) < start.get(i + 1)) {
                        dashSignatureHtml.region(end.get(i), start.get(i + 1));
                        if (dashSignatureHtml.find()) {
                            content = content.substring(0, dashSignatureHtml.start());
                            break;
                        }
                    }
                }
                // content.length() is read after any truncation done by the loop above.
                if (end.get(end.size() - 1) < content.length()) {
                    // after last </blockquote>.
                    dashSignatureHtml.region(end.get(end.size() - 1), content.length());
                    if (dashSignatureHtml.find()) {
                        content = content.substring(0, dashSignatureHtml.start());
                    }
                }
            }
        } else {
            // No blockquotes found.
            // The matcher still holds the first match found at the top of the method.
            content = content.substring(0, dashSignatureHtml.start());
        }
    }
    // Fix the stripping off of closing tags if a signature was stripped,
    // as well as clean up the HTML of the quoted message.
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties properties = cleaner.getProperties();
    // see http://htmlcleaner.sourceforge.net/parameters.php for descriptions
    properties.setNamespacesAware(false);
    properties.setAdvancedXmlEscape(false);
    properties.setOmitXmlDeclaration(true);
    properties.setOmitDoctypeDeclaration(false);
    properties.setTranslateSpecialEntities(false);
    properties.setRecognizeUnicodeChars(false);
    TagNode node = cleaner.clean(content);
    SimpleHtmlSerializer htmlSerialized = new SimpleHtmlSerializer(properties);
    content = htmlSerialized.getAsString(node, "UTF8");
    return content;
}

Example 21

Project: meaningfulweb-master File: ExtractUtils.java View source code

/**
 * Recursively removes every attribute rejected by {@code HtmlExtractUtils.isValidAttribute}
 * from the tag descendants of {@code parent}. The attributes of {@code parent} itself are not
 * examined, and children that are not {@code TagNode}s are skipped.
 *
 * @param parent the node whose subtree is processed
 */
public static void cleanInvalidAttributes(TagNode parent) {
    List children = parent.getChildren();
    if (children == null) {
        return;
    }
    for (int idx = 0; idx < children.size(); idx++) {
        Object child = children.get(idx);
        if (!(child instanceof TagNode)) {
            continue;
        }
        TagNode tag = (TagNode) child;
        // Removal is deferred until after the scan so the attribute map is not
        // modified while it is being iterated.
        Set<String> rejected = new HashSet<String>();
        for (Object key : tag.getAttributes().keySet()) {
            String name = (String) key;
            if (!HtmlExtractUtils.isValidAttribute(name)) {
                rejected.add(name);
            }
        }
        for (String name : rejected) {
            tag.removeAttribute(name);
        }
        cleanInvalidAttributes(tag);
    }
}

Example 22

Project: opensearchserver-master File: HtmlArchiver.java View source code

/**
 * Rewrites the inline CSS of a {@code <style>} element through {@code checkCSSContent}.
 *
 * <p>The node is ignored unless it is a {@code style} tag whose {@code type} is empty or
 * {@code text/css} and whose {@code media} is empty, {@code screen} or {@code all}. The
 * element text is XML-unescaped, passed to {@code checkCSSContent}, and, if the outcome
 * differs from the original text, the node's children are replaced by a single
 * {@code ContentNode} holding the new text.
 *
 * @param node the candidate element
 */
private final void checkStyleCSS(TagNode node) throws ClientProtocolException, IllegalStateException, IOException, SearchLibException, URISyntaxException {
    if (!"style".equalsIgnoreCase(node.getName())) {
        return;
    }
    String attr = node.getAttributeByName("type");
    if (!StringUtils.isEmpty(attr) && !"text/css".equalsIgnoreCase(attr)) {
        return;
    }
    attr = node.getAttributeByName("media");
    if (!StringUtils.isEmpty(attr) && !"screen".equalsIgnoreCase(attr) && !"all".equalsIgnoreCase(attr)) {
        return;
    }
    // Read the text through the CharSequence contract rather than downcasting to
    // StringBuilder: the cast fails with ClassCastException whenever the library
    // hands back any other CharSequence implementation.
    CharSequence text = node.getText();
    if (text == null) {
        return;
    }
    String content = text.toString();
    String newContent = StringEscapeUtils.unescapeXml(content);
    StringBuffer sb = checkCSSContent(baseUrl, newContent);
    if (sb != null) {
        newContent = sb.toString();
    }
    if (newContent.equals(content)) {
        return;
    }
    node.removeAllChildren();
    node.addChild(new ContentNode(newContent));
}

Example 23

Project: TL-android-app-master File: TLLib.java View source code

/**
 * Attempts to log in by fetching the login form token and posting the credentials.
 *
 * <p>Returns {@code false} early when the login page or its first {@code <input>} value
 * cannot be obtained. After the POST, the login is treated as failed when the client's
 * cookie store holds fewer than two cookies; otherwise {@code loginName},
 * {@code loginStatus} and {@code cookieStore} are updated.
 *
 * @param login   user name to submit
 * @param pw      password to submit
 * @param handler receives {@code TLHandler.PROGRESS_LOGIN} before the request starts
 * @param context passed through to the page fetch helper
 * @return the resulting value of the static {@code loginStatus} flag, or {@code false}
 *         when no token could be extracted
 * @throws IOException if the HTTP exchange fails
 */
public static boolean login(String login, String pw, Handler handler, Context context) throws IOException {
    handler.sendEmptyMessage(TLHandler.PROGRESS_LOGIN);
    logout();
    // Fetch the token
    HtmlCleaner cleaner = TLLib.buildDefaultHtmlCleaner();
    URL url = new URL(LOGIN_URL);
    TagNode node = TLLib.TagNodeFromURLLoginToken(cleaner, url, handler, context);
    if (node == null) {
        // No page to read the token from.
        return false;
    }
    String token = null;
    try {
        Object[] inputs = node.evaluateXPath("//input");
        // Guard the index: a page without any <input> must not crash the login.
        if (inputs.length > 0 && inputs[0] instanceof TagNode) {
            token = ((TagNode) inputs[0]).getAttributeByName("value");
        }
    } catch (XPatherException e1) {
        e1.printStackTrace();
    }
    if (token == null) {
        return false;
    }
    DefaultHttpClient httpclient = new DefaultHttpClient();
    HttpPost httpost = new HttpPost(LOGIN_URL);
    List<NameValuePair> nvps = new ArrayList<NameValuePair>();
    nvps.add(new BasicNameValuePair(USER_FIELD, login));
    nvps.add(new BasicNameValuePair(PASS_FIELD, pw));
    nvps.add(new BasicNameValuePair(REMEMBERME, "1"));
    nvps.add(new BasicNameValuePair("stage", "1"));
    nvps.add(new BasicNameValuePair("back_url", "/"));
    nvps.add(new BasicNameValuePair("token", token));
    Log.d("token:", token);
    tokenField = token;
    if (cookieStore != null) {
        httpclient.setCookieStore(cookieStore);
    }
    try {
        httpost.setEntity(new UrlEncodedFormEntity(nvps));
        HttpResponse response = httpclient.execute(httpost);
        HttpEntity entity = response.getEntity();
        // Inspect the client's own store: it is the same object as the static
        // cookieStore when one was installed above, and it avoids dereferencing
        // the static field when that field is still null.
        if (httpclient.getCookieStore().getCookies().size() < 2) {
            loginName = null;
            loginStatus = false;
        } else {
            loginName = login;
            loginStatus = true;
            cookieStore = httpclient.getCookieStore();
        }
        if (entity != null) {
            entity.consumeContent();
        }
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    }
    return loginStatus;
}

Example 24

Project: zseinfo-master File: VulcanReplacementsHandler.java View source code

/**
 * Downloads the replacements page and returns the node that holds the replacement data.
 *
 * <p>If the page contains a {@code table}, its direct {@code tbody} child is returned (which
 * may be {@code null} when the table has none). Otherwise, if the page contains a
 * {@code frame}, the document referenced by the frame's {@code src} (appended to the
 * configured URL) is downloaded and its root node is returned.
 *
 * @return the {@code tbody} of the first table, or the root of the framed document
 * @throws IOException          if a connection does not answer with HTTP 200
 * @throws NullPointerException if the page has neither a table nor a frame
 */
private static TagNode downloadReplacements() throws IOException {
    HtmlCleaner cleaner = new HtmlCleaner();
    String url = Configuration.getInstance().getReplacementsConfig().getReplacementsUrl();
    TagNode node = fetchAndClean(cleaner, url);
    TagNode table = node.findElementByName("table", true);
    if (table != null) {
        return table.findElementByName("tbody", false);
    }
    TagNode frame = node.findElementByName("frame", true);
    if (frame == null) {
        // Same exception type the missing table used to produce, so existing callers
        // that catch it keep working.
        throw new NullPointerException("Replacements page contains neither a table nor a frame");
    }
    return fetchAndClean(cleaner, url + frame.getAttributeByName("src"));
}

/** Fetches {@code address}, requires an HTTP 200 answer and parses the body, closing the stream afterwards. */
private static TagNode fetchAndClean(HtmlCleaner cleaner, String address) throws IOException {
    HttpURLConnection connection = (HttpURLConnection) new URL(address).openConnection();
    if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
        throw new IOException("BÅ‚Ä…d poÅ‚Ä…czenia z serwerem: " + connection.getResponseCode() + " " + connection.getResponseMessage());
    }
    InputStream input = connection.getInputStream();
    try {
        return cleaner.clean(input, Configuration.getInstance().getReplacementsConfig().getEncoding());
    } finally {
        input.close();
    }
}

Example 25

Project: AbianReader-master File: AbianReaderItemView.java View source code

/**
 * Renders the RSS item at {@code itemPosition} into the web view.
 *
 * <p>Builds an HTML page consisting of a style block that constrains {@code img},
 * {@code iframe} and {@code div} sizes relative to the screen, a title/byline header, an
 * optional featured image link, and the item content. The page is cleaned, the
 * {@code width}/{@code height} attributes are removed from all images and iframes, and
 * the serialized result is loaded. If serialization fails the uncleaned page is loaded.
 *
 * @param itemPosition index of the item to display; also stored in {@code m_targetRssItemNumber}
 */
public void setTargetRssItem(int itemPosition) {
    m_targetRssItemNumber = itemPosition;
    AbianReaderItem rssItem = AbianReaderApplication.getData().getItemNumber(itemPosition);
    if (rssItem == null) {
        Log.e(TAG, "TheItem is null");
        return;
    }

    // Size limits: 90% of the scaled screen, height reduced to 75% when the screen is wider than tall.
    final int screenWidth = AbianReaderApplication.s_width;
    final int screenHeight = AbianReaderApplication.s_height;
    final float scale = m_webView.getScale();
    final float heightFactor = (screenWidth > screenHeight) ? 0.75f : 0.9f;
    final float maxWidth = (screenWidth / scale) * 0.9f;
    final float maxHeight = (screenHeight / scale) * heightFactor;

    final String constraints = new StringBuilder("{ ")
            .append("max-width: ").append((int) maxWidth).append("; ")
            .append("max-height: ").append((int) maxHeight).append("; ")
            .append("width: auto; ")
            .append("height: auto; ")
            .append("display: block; ")
            .append("margin-left: auto; ")
            .append("margin-right: auto; ")
            .append("}")
            .toString();

    StringBuilder page = new StringBuilder("<html>");
    page.append("<head>").append("<style>");
    page.append("img ").append(constraints);
    page.append("\niframe ").append(constraints);
    page.append("\ndiv ").append(constraints);
    page.append("</style>").append("</head>");
    page.append("<body><h2>").append(rssItem.getTitle()).append("</h2>");
    page.append("<small>By ").append(rssItem.getCreator());
    page.append(" posted ").append(rssItem.getPubDate()).append("</small>");
    String featuredLink = rssItem.getFeaturedImageLink();
    if (featuredLink.length() != 0) {
        page.append("<br /><br />");
        page.append("<a href=\"").append(featuredLink).append("\">");
        page.append("<img src=\"").append(featuredLink).append("\" /> </a>");
    }
    page.append(rssItem.getContent());
    page.append("<br /><br /></body></html>");
    String ourHtml = page.toString();

    TagNode cleanedRoot = m_htmlCleaner.clean(ourHtml);
    // Explicit dimensions would override the max-width/max-height rules above.
    for (String tagName : new String[] { "img", "iframe" }) {
        for (TagNode sized : cleanedRoot.getElementsByName(tagName, true)) {
            sized.removeAttribute("width");
            sized.removeAttribute("height");
        }
    }
    try {
        ourHtml = m_htmlSerializer.getAsString(cleanedRoot);
    } catch (IOException e) {
        e.printStackTrace();
    }
    m_webView.loadDataWithBaseURL(null, ourHtml, "text/html", "UTF-8", null);
}

Example 26

Project: PressGangCCMSREST-master File: TopicSourceURLTitleThread.java View source code

/**
 * Downloads the page behind {@code topicSourceUrl}, takes the text of its first
 * {@code <title>} element, replaces a few common HTML entities, trims the result and stores
 * it on the entity. Nothing is stored when the page has no title; I/O failures are logged.
 *
 * @param topicSourceUrl the entity whose source URL is fetched and whose title is set
 */
protected void setTitle(final TopicSourceUrl topicSourceUrl) {
    try {
        // Some common string replacements to make in the titles. The keys are entity
        // names; "&amp;" is handled last so that text such as "&amp;nbsp;" is only
        // unescaped once.
        final String[][] replaceList = { { "&nbsp;", " " }, { "&amp;", "&" } };
        // create an instance of HtmlCleaner
        final HtmlCleaner cleaner = new HtmlCleaner();
        // clean the source url
        final TagNode node = cleaner.clean(new URL(topicSourceUrl.getSourceUrl()));
        // find the first title node
        final TagNode title = node.findElementByName("title", true);
        if (title != null) {
            // clean up the title
            String titleText = title.getText().toString();
            for (final String[] replacement : replaceList) {
                // Literal replacement: the keys are plain text, not regular expressions.
                titleText = titleText.replace(replacement[0], replacement[1]);
            }
            titleText = titleText.trim();
            // assign it to the entity
            topicSourceUrl.setTitle(titleText);
        }
    } catch (final IOException ex) {
        LOGGER.error("Probably a problem with HTMLCleaner", ex);
    }
}

Example 27

Project: webmagic-master File: XpathSelectorTest.java View source code

/**
 * Manual benchmark: prints the elapsed milliseconds for 2000 repetitions each of
 * HtmlCleaner parsing, HtmlCleaner XPath evaluation, Jsoup parsing, Jsoup CSS selection,
 * a second HtmlCleaner parse/XPath pass, and a precompiled Xsoup XPath evaluation.
 */
@Ignore("take long time")
@Test
public void parserPerformanceTest() throws XPatherException {
    final int rounds = 2000;
    System.out.println(html.length());
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode cleanedRoot = cleaner.clean(html);
    Document jsoupDocument = Jsoup.parse(html);

    // HtmlCleaner: parse, then XPath.
    long startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        cleaner.clean(html);
    }
    System.out.println(System.currentTimeMillis() - startedAt);
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        cleanedRoot.evaluateXPath("//a");
    }
    System.out.println(System.currentTimeMillis() - startedAt);
    System.out.println("=============");

    // Jsoup: parse, then CSS select.
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        Jsoup.parse(html);
    }
    System.out.println(System.currentTimeMillis() - startedAt);
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        jsoupDocument.select("a");
    }
    System.out.println(System.currentTimeMillis() - startedAt);
    System.out.println("=============");

    // HtmlCleaner again: parse, then XPath.
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        cleaner.clean(html);
    }
    System.out.println(System.currentTimeMillis() - startedAt);
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        cleanedRoot.evaluateXPath("//a");
    }
    System.out.println(System.currentTimeMillis() - startedAt);
    System.out.println("=============");

    // Xsoup: expression compiled once, evaluated against the Jsoup document.
    XPathEvaluator compiledXPath = Xsoup.compile("//a");
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        compiledXPath.evaluate(jsoupDocument);
    }
    System.out.println(System.currentTimeMillis() - startedAt);
}

Example 28

Project: zkBrowser-master File: Search.java View source code

/**
 * Requests the search page for the current term and location and reads the result count.
 *
 * <p>When the result-count XPath matches, the fourth space-separated word of the first
 * match is parsed as the count. Otherwise the count is 0 if the no-results XPath matches
 * and 1 if it does not.
 *
 * @return the number of results as described above
 */
public int getResultsCount() {
    String requestUrl = mRequestUrl;
    try {
        String encodedTerm = URLEncoder.encode(mTerm, "UTF-8");
        String encodedLocation = URLEncoder.encode(mLocation, "UTF-8");
        requestUrl = requestUrl + encodedTerm + "/" + encodedLocation;
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    }
    TagNode rootNode = getRootNode(requestUrl);
    Object[] countMatches = evaluateXPath(Utils.XPATH_RESULT_COUNT, rootNode);
    Object[] emptyMatches = evaluateXPath(Utils.XPATH_NO_RESULTS, rootNode);
    // Value used when no explicit count is present on the page.
    int fallback = (emptyMatches.length > 0) ? 0 : 1;
    if (countMatches.length == 0) {
        return fallback;
    }
    String countText = ((StringBuffer) countMatches[0]).toString();
    String[] words = countText.split(" ");
    return Integer.parseInt(words[3]);
}

Example 29

Project: Ebselen-master File: IDEToEbselen.java View source code

/**
 * Cleans the relevant file and generates a valid XML file ready for processing to Sel 2 java File.
 *
 * <p>The output is written to the {@code java.io.tmpdir} directory under the source file
 * name with an {@code .xml} suffix. Directories are rejected before any output file
 * handler is created, and the output handler is closed even when cleaning or writing fails.
 *
 * @param absoluteFilename - name of the file to convert.
 * @return String - location of the converted file, or {@code null} if the input is a directory.
 * @throws Exception if the file cannot be cleaned, written or closed
 */
public String convertToXML(String absoluteFilename) throws Exception {
    FileHandler fromSelIDE = new FileHandler(absoluteFilename);
    if (fromSelIDE.getFile().isDirectory()) {
        LOGGER.error("Cannot convert directory {} into a Selenium Test!", fromSelIDE.getFileName());
        return null;
    }
    FileHandler toXML = new FileHandler(System.getProperty("java.io.tmpdir") + File.separator + fromSelIDE.getFileName() + ".xml", true);
    try {
        //Clean up html so that we can read it as XML properly
        HtmlCleaner cleaner = new HtmlCleaner();
        CleanerProperties XMLPrefs = cleaner.getProperties();
        XMLPrefs.setUseEmptyElementTags(true);
        XMLPrefs.setTranslateSpecialEntities(true);
        XMLPrefs.setTransResCharsToNCR(true);
        XMLPrefs.setOmitComments(true);
        XMLPrefs.setOmitDoctypeDeclaration(true);
        XMLPrefs.setNamespacesAware(false);
        TagNode tagNode = new HtmlCleaner(XMLPrefs).clean(fromSelIDE.getFile());
        new PrettyXmlSerializer(XMLPrefs).writeToStream(tagNode, toXML.getWritableFileOutputStream(), "utf-8");
    } finally {
        // Release the output handler on both the success and the failure path.
        toXML.close();
    }
    return toXML.getAbsoluteFile();
}

Example 30

Project: fenixedu-ist-teacher-service-master File: AnnualTeachingCreditsDocumentFilter.java View source code

/**
 * Parses {@code dirtyHtml} with HtmlCleaner and serializes it back with a
 * {@code SimpleHtmlSerializer} that uses the cleaner's properties.
 *
 * @param dirtyHtml the markup to tidy
 * @return the serialized markup, or {@code StringUtils.EMPTY} if an {@code IOException} occurs
 */
private String clean(String dirtyHtml) {
    String cleaned = StringUtils.EMPTY;
    try {
        final HtmlCleaner htmlCleaner = new HtmlCleaner();
        final TagNode parsedRoot = htmlCleaner.clean(dirtyHtml);
        final SimpleHtmlSerializer serializer = new SimpleHtmlSerializer(htmlCleaner.getProperties());
        cleaned = serializer.getAsString(parsedRoot);
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
    }
    return cleaned;
}

Example 31

Project: Joomla-Eclipse-master File: JoomlaDeployerImpl.java View source code

/**
 * Installs {@code extension} on the server by submitting the installer form in
 * "folder" mode, pointing it at the directory that contains the extension manifest.
 *
 * <p>The deployment is recorded in every non-exceptional case. The returned status is OK
 * only when the server's first system message has INFO severity; a missing message or any
 * other severity yields WARNING, and a {@code RuntimeException} yields ERROR.
 *
 * @param extension        the extension to install
 * @param transientRuntime runtime used to look up the persistent deployment runtime
 * @param progressMonitor  progress reporting; always marked done on exit
 * @return the outcome as described above
 */
@Override
public IStatus install(BasicExtensionModel extension, DeploymentRuntime transientRuntime, IProgressMonitor progressMonitor) {
    try {
        progressMonitor.beginTask("Install " + extension.getName(), 2000);
        final DeploymentRuntime runtime = ServerUtils.getPersistentDeploymentRuntime(transientRuntime, getDeploymentDescriptor());
        final IJoomlaHttpSession httpSession = runtime.getHttpSession();
        final String extensionDir = ResourcesPlugin.getWorkspace().getRoot().getFile(extension.getManifestPath()).getLocation().toFile().getParent();

        // Step 1: load the installer page and collect its hidden form fields.
        final TagNode installPage = httpSession.executeAndParseResponseBody(new PrepareInstallationRequest(runtime), true, new SubProgressMonitor(progressMonitor, 1000));
        final String formAttribute;
        if (runtime.getServer().getMajorVersion() == MajorJoomlaVersion.ONE_SIX) {
            formAttribute = "id";
        } else {
            formAttribute = "name";
        }
        final String hiddenInputsXPath = "//form[@" + formAttribute + "='adminForm']//input[@type='hidden']";
        final List<NameValuePair> formParams = ServerUtils.extractInputNameValuePairs(hiddenInputsXPath, installPage);

        // Step 2: replace any existing install type with a folder installation.
        for (final Iterator<NameValuePair> it = formParams.iterator(); it.hasNext();) {
            if ("installtype".equals(it.next().getName())) {
                it.remove();
            }
        }
        formParams.add(new NameValuePair("installtype", "folder"));
        formParams.add(new NameValuePair("install_directory", extensionDir));

        // Step 3: submit and interpret the first system message of the response.
        final TagNode response = httpSession.executeAndParseResponseBody(new GenericPostRequest("administrator/index.php?option=com_installer&view=install", runtime, formParams), true, new SubProgressMonitor(progressMonitor, 1000));
        final JoomlaSystemMessage systemMessage = ServerUtils.extractFirstSystemMessage(response);
        final boolean informational = systemMessage != null && systemMessage.getSeverity() == MessageSeverity.INFO;
        newDeployment(extension, runtime);
        if (systemMessage == null) {
            return JoomlaCorePlugin.newStatus(IStatus.WARNING, "No confirmation message. Installation likely failed.");
        }
        if (informational) {
            return JoomlaCorePlugin.newStatus(IStatus.OK, systemMessage.getMessage());
        }
        // really a warning, because it might be "extension already installed" type of message
        // TODO: possibly try to distinguish between failure & "already installed" - fetch list of extensions
        return JoomlaCorePlugin.newStatus(IStatus.WARNING, systemMessage.getMessage());
    } catch (final RuntimeException e) {
        final String message = "Unexpected exception while installing extension " + extension.getName();
        JoomlaCorePlugin.logError(message, e);
        return JoomlaCorePlugin.newStatus(IStatus.ERROR, message, e);
    } finally {
        progressMonitor.done();
    }
}

Example 32

Project: PoliSons-master File: News.java View source code

/**
 * Flattens the direct children of {@code node} into a small HTML fragment.
 *
 * <p>Each child is dispatched on its {@code toString()} value: {@code strong} becomes
 * {@code <b>…</b>}, {@code img} and {@code br} become {@code <br>}, {@code a} contributes
 * its text only, {@code small} is kept as {@code <small>…</small>}, and anything else is
 * appended as its {@code toString()} value.
 *
 * @param node         the node whose children are read
 * @param keepHtmlTags {@code true} to return the fragment as is, {@code false} to pass it
 *                     through {@code Html.fromHtml} and return the resulting text
 * @return the assembled fragment or its tag-free rendering
 */
private String extractText(TagNode node, boolean keepHtmlTags) {
    StringBuilder fragment = new StringBuilder();
    for (Object child : node.getChildren()) {
        String label = child.toString();
        if (label.equals("strong")) {
            fragment.append("<b>").append(((TagNode) child).getText().toString()).append("</b>");
        } else if (label.equals("img") || label.equals("br")) {
            fragment.append("<br>");
        } else if (label.equals("a")) {
            fragment.append(((TagNode) child).getText().toString());
        } else if (label.equals("small")) {
            fragment.append("<small>").append(((TagNode) child).getText().toString()).append("</small>");
        } else if (label != null) {
            fragment.append(label);
        }
    }
    String html = fragment.toString();
    if (keepHtmlTags) {
        return html;
    }
    // Reformat encoded characters and remove html tags like <br>
    return Html.fromHtml(html).toString();
}

Example 33

Project: sisob-academic-data-extractor-master File: ResearchersPagePostProcessor.java View source code

/**
 * Cleans the HTML file {@code path/nameFile} (read as UTF-8) and writes the compact
 * serialization to {@code path/newNameFile}. Success is logged at info level; any
 * exception is caught and logged as a warning.
 *
 * @param props       cleaner properties used for both parsing and serialization
 * @param path        directory containing the input and receiving the output
 * @param nameFile    name of the file to clean
 * @param newNameFile name of the file to write
 */
public static void cleanFile(CleanerProperties props, String path, String nameFile, String newNameFile) {
    final String sourcePath = path + File.separator + nameFile;
    final String targetPath = path + File.separator + newNameFile;
    try {
        // do parsing
        TagNode cleanedRoot = new HtmlCleaner(props).clean(new File(sourcePath), "utf-8");
        // serialize to the target file
        new CompactHtmlSerializer(props).writeToFile(cleanedRoot, targetPath, "UTF-8");
        ProjectLogger.LOGGER.info(sourcePath + " cleaned!");
    } catch (Exception ex) {
        ProjectLogger.LOGGER.warn(ex.getMessage() + " " + sourcePath + " NOT FOUND!");
    }
}

Example 34

Project: StackX-master File: MarkdownFormatter.java View source code

/**
 * Runs {@code markdownText} through HtmlCleaner with a transformation registered for the
 * {@code Tags.BR} tag and returns the result serialized as simple HTML.
 *
 * @param markdownText the markup to clean
 * @return the serialized, cleaned markup
 * @throws IOException if serialization fails
 */
private static String clean(String markdownText) throws IOException {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerTransformations transformations = new CleanerTransformations();
    transformations.addTransformation(new TagTransformation(Tags.BR, Tags.BR + "/", true));
    cleaner.setTransformations(transformations);
    TagNode node = cleaner.clean(markdownText);
    // Serialize once: writing into a throw-away StringWriter first only repeated the work.
    return new SimpleHtmlSerializer(cleaner.getProperties()).getAsString(node);
}

Example 35

Project: xwiki-commons-master File: DefaultHTMLCleaner.java View source code

/**
 * Cleans HTML read from {@code originalHtmlContent} into a W3C DOM document, forces an
 * XHTML 1.0 Strict doctype on it, and applies the configured filters.
 *
 * @param originalHtmlContent source of the HTML to clean
 * @param configuration       supplies the cleaner properties, filters and filter parameters
 * @return the cleaned and filtered document
 * @throws RuntimeException if cleaning fails or the DOM cannot be built
 */
@Override
public Document clean(Reader originalHtmlContent, HTMLCleanerConfiguration configuration) {
    // Note: Instantiation of an HtmlCleaner object is cheap so there's no need to cache an instance of it,
    // especially since this makes it extra safe with regards to multithreading (even though HTML Cleaner is
    // already supposed to be thread safe).
    CleanerProperties properties = getDefaultCleanerProperties(configuration);
    HtmlCleaner htmlCleaner = new HtmlCleaner(properties);

    TagNode rootNode;
    try {
        rootNode = htmlCleaner.clean(originalHtmlContent);
    } catch (Exception e) {
        throw new RuntimeException("Unhandled error when cleaning HTML", e);
    }

    Document document;
    try {
        // Ideally we would use SF's HTMLCleaner DomSerializer but there are outstanding issues with it, so we're
        // using a custom XWikiDOMSerializer (see its javadoc for more details).
        // Replace by the following when fixed:
        //   document = new DomSerializer(properties, false).createDOM(rootNode);
        DoctypeToken strictDoctype = new DoctypeToken("html", "PUBLIC", "-//W3C//DTD XHTML 1.0 Strict//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
        rootNode.setDocType(strictDoctype);
        XWikiDOMSerializer serializer = new XWikiDOMSerializer(properties, false);
        document = serializer.createDOM(getAvailableDocumentBuilder(), rootNode);
    } catch (ParserConfigurationException ex) {
        throw new RuntimeException("Error while serializing TagNode into w3c dom.", ex);
    }

    // Finally apply filters.
    for (HTMLFilter htmlFilter : configuration.getFilters()) {
        htmlFilter.filter(document, configuration.getParameters());
    }
    return document;
}

Example 36

Project: fenixedu-academic-master File: ProcessCandidacyPrintAllDocumentsFilter.java View source code

/**
 * Parses {@code dirtyHtml} with HtmlCleaner and serializes it back with a
 * {@code SimpleHtmlSerializer} that uses the cleaner's properties.
 *
 * @param dirtyHtml the markup to tidy
 * @return the serialized markup, or {@code StringUtils.EMPTY} if an
 *         {@code HtmlCleanerException} is raised
 */
private String clean(String dirtyHtml) {
    String cleaned = StringUtils.EMPTY;
    try {
        final HtmlCleaner htmlCleaner = new HtmlCleaner();
        final TagNode parsedRoot = htmlCleaner.clean(dirtyHtml);
        final SimpleHtmlSerializer serializer = new SimpleHtmlSerializer(htmlCleaner.getProperties());
        cleaned = serializer.getAsString(parsedRoot);
    } catch (HtmlCleanerException e) {
        logger.error(e.getMessage(), e);
    }
    return cleaned;
}

Example 37

Project: LimeWire-Pirate-Edition-master File: TorrentWebSearch.java View source code

/**
 * Extracts all uris from <code>htmlFile</code> that are the targets of anchor
 * elements and could be potential torrent uris.
 *
 * <p>Each {@code href} is tested as written first; if it is rejected by
 * {@code canBeTorrentUri} it is resolved against {@code referrer} and tested again.
 * Hrefs that cannot be parsed as URIs are logged and skipped.
 *
 * @param htmlFile the HTML file to scan
 * @param referrer base URI used to resolve links that do not qualify on their own
 * @return the accepted links, in document order
 * @throws IOException if the file cannot be read
 */
List<URI> extractTorrentUriCandidates(File htmlFile, URI referrer) throws IOException {
    TagNode root = new HtmlCleaner().clean(htmlFile);
    @SuppressWarnings("unchecked") List<TagNode> withHref = root.getElementListHavingAttribute("href", true);
    List<URI> candidates = new ArrayList<URI>(withHref.size());
    for (TagNode element : withHref) {
        // Only anchors are of interest; other elements may carry an href too.
        if ("a".equalsIgnoreCase(element.getName())) {
            String href = element.getAttributeByName("href");
            LOG.debugf("resolving: {0} with {1}", href, referrer);
            try {
                URI link = URIUtils.toURI(href);
                if (!canBeTorrentUri(link)) {
                    link = org.apache.http.client.utils.URIUtils.resolve(referrer, link);
                    if (!canBeTorrentUri(link)) {
                        LOG.debugf("not a potential torrent link: {0}", link);
                        continue;
                    }
                }
                candidates.add(link);
            } catch (URISyntaxException e) {
                LOG.debug("error parsing", e);
            }
        }
    }
    return candidates;
}

Example 38

Project: mitbbs-android-master File: FetchWebpage.java View source code

/**
 * Gets the page-navigation links (previous page, next page, ...) for a board.
 *
 * <p>The page is downloaded and parsed, the elements whose {@code id} equals
 * {@code tagName} are collected, and the {@code a} links inside the <em>second</em> such
 * element are returned as absolute URLs. Any failure is logged and whatever links were
 * collected up to that point are returned.
 *
 * @param htmlCleaner   cleaner used to parse the page
 * @param mitbbspageURL URL of the board page
 * @param encoding      character encoding used to decode the response
 * @param tagName       value of the {@code id} attribute to look for
 * @return the collected links; when exactly two links are found the list is rearranged
 *         to {@code [null, null, first, second, second]}
 */
public ArrayList<URL> getBoardLinks(HtmlCleaner htmlCleaner, URL mitbbspageURL, String encoding, String tagName) {
    Log.i(TAG, "getBoardLinks");
    ArrayList<URL> links = new ArrayList<URL>();
    try {
        URLConnection conn = mitbbspageURL.openConnection();
        InputStreamReader reader = new InputStreamReader(conn.getInputStream(), encoding);
        TagNode node;
        try {
            node = htmlCleaner.clean(reader);
        } finally {
            // The tree is fully built once clean() returns; release the connection stream.
            reader.close();
        }
        List nodes = node.getElementListByAttValue("id", tagName, true, true);
        Log.i(TAG, "getBoardLinks nodes.length=" + String.valueOf(nodes.size()));
        // The second matching element holds the navigation links.
        TagNode pnode = (TagNode) nodes.get(1);
        List linklist = pnode.getElementListByName("a", true);
        for (int i = 0; i < linklist.size(); i++) {
            TagNode pre = (TagNode) linklist.get(i);
            links.add(new URL(moibleBaseURL + pre.getAttributeByName("href")));
        }
        if (links.size() == 2) {
            // Shift the two links behind two null placeholders.
            // NOTE(review): this yields five entries (the second link appears twice at
            // indexes 3 and 4); kept as is because callers may rely on it - confirm
            // whether a four-entry list was intended.
            URL obj = links.get(0);
            URL obj1 = links.get(1);
            links.set(0, null);
            links.add(1, null);
            links.add(2, obj);
            links.add(3, obj1);
        }
    } catch (Exception ex) {
        Log.i(TAG, "getBoardLinks" + ex.toString());
    }
    return links;
}

Example 39

Project: stanbol-master File: DomSerializer2.java View source code

/**
 * Builds a new W3C DOM document mirroring the tree rooted at {@code rootNode}: a root
 * element named after the node is created and appended, its attributes are copied with
 * {@code setAttributes}, and the children are added through {@code createSubnodes}.
 *
 * @param rootNode the root of the cleaned tree
 * @return the newly created document
 * @throws ParserConfigurationException if no document builder can be created
 */
public Document createDOM(TagNode rootNode) throws ParserConfigurationException {
    DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
    // Namespace handling follows the cleaner properties.
    builderFactory.setNamespaceAware(props.isNamespacesAware());
    Document dom = builderFactory.newDocumentBuilder().newDocument();

    Element domRoot = dom.createElement(rootNode.getName());
    dom.appendChild(domRoot);
    setAttributes(rootNode, domRoot);
    createSubnodes(dom, domRoot, rootNode.getChildren());
    return dom;
}

Example 40

Project: fastcatsearch-master File: ReadabilityExtractor.java View source code

/**
 * Converts an HTML string into an XML string using HtmlCleaner.
 *
 * <p>Comments are omitted, {@code script} and {@code style} tags are pruned, special
 * entities are translated and namespaces are ignored. If serialization fails the error
 * is logged and the input is returned unchanged.
 *
 * @param source the HTML text to convert
 * @return the pretty-printed XML, or {@code source} itself when an {@code IOException} occurs
 */
private String toXML(String source) {
    try {
        CleanerProperties props = new CleanerProperties();
        props.setTranslateSpecialEntities(true);
        props.setOmitComments(true);
        props.setPruneTags("script,style");
        // Ignore namespaces.
        props.setNamespacesAware(false);
        props.setAdvancedXmlEscape(true);
        HtmlCleaner cl = new HtmlCleaner(props);
        TagNode tagNode = cl.clean(source);
        source = new PrettyXmlSerializer(props).getXmlAsString(tagNode);
    } catch (IOException e) {
        logger.error("Failed to convert HTML to XML", e);
    }
    return source;
}

Example 41

Project: weblounge-master File: XhtmlRendererPagePreviewGenerator.java View source code

/**
   * {@inheritDoc}
   * <p>
   * This implementation renders the page to HTML, converts it to XHTML with
   * HtmlCleaner, rasterizes the XHTML into a temporary image file and finally
   * hands that image to the first registered image preview generator, which
   * writes the result to <code>os</code>. Both temporary files are removed
   * before the method returns or throws.
   * 
   * @see ch.entwine.weblounge.common.content.PreviewGenerator#createPreview(ch.entwine.weblounge.common.content.Resource,
   *      ch.entwine.weblounge.common.site.Environment,
   *      ch.entwine.weblounge.common.language.Language,
   *      ch.entwine.weblounge.common.content.image.ImageStyle, String,
   *      java.io.InputStream, java.io.OutputStream)
   */
public void createPreview(Resource<?> resource, Environment environment, Language language, ImageStyle style, String format, InputStream is, OutputStream os) throws IOException {
    if (!isRenderingEnvironmentSane) {
        logger.debug("Skipping page preview rendering as environment is not sane");
        return;
    }
    if (resource == null)
        throw new IllegalArgumentException("Resource cannot be null");
    ImagePreviewGenerator imagePreviewGenerator = null;
    synchronized (previewGenerators) {
        if (previewGenerators.size() == 0) {
            logger.debug("Unable to generate page previews since no image renderer is available");
            return;
        }
        imagePreviewGenerator = previewGenerators.get(0);
    }
    ResourceURI uri = resource.getURI();
    Site site = uri.getSite();
    String html = null;
    try {
        URL pageURL = new URL(UrlUtils.concat(site.getHostname(environment).toExternalForm(), PAGE_HANDLER_PREFIX, uri.getIdentifier()));
        html = render(pageURL, site, environment, language, resource.getVersion());
        if (StringUtils.isBlank(html)) {
            logger.warn("Error rendering preview of page " + uri.getPath());
            return;
        }
        html = HTMLUtils.escapeHtml(HTMLUtils.unescape(html));
    } catch (ServletException e) {
        logger.warn("Error rendering page " + uri.getPath(), e);
        throw new IOException(e);
    }
    // Try to convert html to xhtml
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties xhtmlProperties = cleaner.getProperties();
    TagNode xhtmlNode = cleaner.clean(html);
    if (xhtmlNode == null) {
        logger.warn("Error creating well-formed document from page {}", resource);
        return;
    }
    File xhtmlFile = null;
    // NOTE(review): this stream is closed below without ever being read in
    // this method; kept as-is to leave the existing behavior untouched.
    is = new ByteArrayInputStream(html.getBytes("UTF-8"));
    // generator can only handle files.
    try {
        xhtmlFile = File.createTempFile("xhtml", ".xml");
        Serializer xhtmlSerializer = new SimpleXmlSerializer(xhtmlProperties);
        xhtmlSerializer.writeToFile(xhtmlNode, xhtmlFile.getAbsolutePath(), "UTF-8");
    } catch (IOException e) {
        logger.error("Error creating temporary copy of file content at " + xhtmlFile, e);
        FileUtils.deleteQuietly(xhtmlFile);
        throw e;
    } finally {
        IOUtils.closeQuietly(is);
    }
    File imageFile = File.createTempFile("xhtml-preview", "." + PREVIEW_FORMAT);
    FileOutputStream imageFos = null;
    // Tracks whether the image was fully written, so that the temporary image
    // file is removed on every early return or failure below.
    boolean rendered = false;
    // Render the page and write back to client
    try {
        int screenshotWidth = DEFAULT_SCREENSHOT_WIDTH;
        int screenshotHeight = DEFAULT_SCREENSHOT_HEIGHT;
        if (style != null && style.getWidth() > 0 && style.getHeight() > 0) {
            // Keep the default width and derive the height from the style's aspect ratio
            screenshotHeight = (int) ((float) screenshotWidth / (float) style.getWidth() * style.getHeight());
        }
        // Create the renderer. Due to a synchronization bug in the software,
        // this needs to be synchronized
        Java2DRenderer renderer = null;
        try {
            synchronized (this) {
                renderer = new Java2DRenderer(xhtmlFile, screenshotWidth, screenshotHeight);
            }
        } catch (Throwable t) {
            if (isRenderingEnvironmentSane) {
                logger.warn("Error creating Java 2D renderer for previews: {}", t.getMessage());
                logger.warn("Page preview rendering will be switched off");
                isRenderingEnvironmentSane = false;
            }
            logger.debug("Error creating Java 2D renderer for preview of page {}: {}", uri.getPath(), t.getMessage());
            return;
        }
        // Configure the renderer
        renderer.getSharedContext().setBaseURL(site.getHostname().toExternalForm());
        renderer.getSharedContext().setInteractive(false);
        // Make sure the renderer is using a user agent that will correctly
        // resolve urls
        WebloungeUserAgent agent = userAgents.get(site.getIdentifier());
        if (agent == null) {
            agent = new WebloungeUserAgent(site.getHostname().getURL());
            userAgents.put(site.getIdentifier(), agent);
        }
        renderer.getSharedContext().setUserAgentCallback(agent);
        // Render the page to an image
        BufferedImage img = renderer.getImage();
        FSImageWriter imageWriter = new FSImageWriter(PREVIEW_FORMAT);
        imageFos = new FileOutputStream(imageFile);
        imageWriter.write(img, imageFos);
        rendered = true;
    } catch (IOException e) {
        logger.error("Error creating temporary copy of file content at " + xhtmlFile, e);
        throw e;
    } catch (XRRuntimeException e) {
        logger.warn("Error rendering page content at " + uri + ": " + e.getMessage());
        throw e;
    } catch (HeadlessException e) {
        logger.warn("Headless error while trying to render page preview: " + e.getMessage());
        logger.warn("Page preview rendering will be switched off");
        isRenderingEnvironmentSane = false;
        throw e;
    } catch (Throwable t) {
        logger.warn("Error rendering page content at " + uri + ": " + t.getMessage(), t);
        throw new IOException(t);
    } finally {
        IOUtils.closeQuietly(imageFos);
        FileUtils.deleteQuietly(xhtmlFile);
        if (!rendered) {
            FileUtils.deleteQuietly(imageFile);
        }
    }
    FileInputStream imageIs = null;
    // Scale the image to the correct size
    try {
        imageIs = new FileInputStream(imageFile);
        imagePreviewGenerator.createPreview(resource, environment, language, style, PREVIEW_FORMAT, imageIs, os);
    } catch (IOException e) {
        logger.error("Error creating temporary copy of file content at " + xhtmlFile, e);
        throw e;
    } catch (Throwable t) {
        logger.warn("Error scaling page preview at " + uri + ": " + t.getMessage(), t);
        throw new IOException(t);
    } finally {
        IOUtils.closeQuietly(imageIs);
        FileUtils.deleteQuietly(imageFile);
    }
}

Example 42

Project: book_reader_lib-master File: BookView.java View source code

/**
 * Turns the text range covered by a tag carrying an {@code href} attribute into a link.
 * Hrefs starting with one of the configured external protocols become a {@link URLSpan};
 * every other href gets a clickable span that navigates to the href resolved against the
 * spine. Tags without an {@code href} attribute are left untouched.
 *
 * @param node    the tag being handled
 * @param builder the text the span is applied to
 * @param start   start offset of the span within {@code builder}
 * @param end     end offset of the span within {@code builder}
 */
@Override
public void handleTagNode(TagNode node, SpannableStringBuilder builder, int start, int end) {
    final String href = node.getAttributeByName("href");
    if (href == null) {
        return;
    }
    // Lower-case once, independently of the device locale, so that protocol matching
    // does not break under locales with special casing rules (e.g. Turkish dotless i).
    final String normalizedHref = href.toLowerCase(java.util.Locale.ROOT);
    // First check if it should be a normal URL link
    for (String protocol : this.externalProtocols) {
        if (normalizedHref.startsWith(protocol)) {
            builder.setSpan(new URLSpan(href), start, end, Spannable.SPAN_EXCLUSIVE_EXCLUSIVE);
            return;
        }
    }
    // If not, consider it an internal nav link.
    ClickableSpan span = new ClickableSpan() {

        @Override
        public void onClick(View widget) {
            navigateTo(spine.resolveHref(href));
        }
    };
    builder.setSpan(span, start, end, Spannable.SPAN_EXCLUSIVE_EXCLUSIVE);
}

Example 43

Project: feedscribe-master File: FeedManager.java View source code

/**
 * Collects the text of every content node below {@code node}, passing each piece
 * through {@code htmlUnescapeInto}, and returns the trimmed result.
 *
 * @param node root of the subtree to traverse
 * @return the concatenated, trimmed text
 */
public String cleanDescription(TagNode node) {
    final StringBuilder text = new StringBuilder();
    final TagNodeVisitor textCollector = new TagNodeVisitor() {

        @Override
        public boolean visit(TagNode parent, HtmlNode current) {
            if (current instanceof ContentNode) {
                htmlUnescapeInto(((ContentNode) current).getContent(), text);
            }
            // Always continue so that the whole subtree is visited.
            return true;
        }
    };
    node.traverse(textCollector);
    return text.toString().trim();
}

Example 44

Project: kolmafia-master File: RuntimeLibrary.java View source code

/**
 * Evaluates an XPath expression against an HTML string and returns the matches as an
 * array of strings. Matches that are tag nodes are serialized with a
 * {@link SimpleXmlSerializer}; any other match is converted with {@code toString()}.
 *
 * @param interpreter used to raise script-level runtime exceptions
 * @param html        the markup to clean and query
 * @param xpath       the XPath expression to evaluate
 * @return an aggregate of strings, one entry per match, indexed from 0
 */
public static Value xpath(Interpreter interpreter, final Value html, final Value xpath) {
    final HtmlCleaner cleaner = HTMLParserUtils.configureDefaultParser();

    final TagNode root;
    try {
        root = cleaner.clean(html.toString());
    } catch (IOException e) {
        StaticEntity.printStackTrace(e);
        throw interpreter.runtimeException("something went wrong while cleaning html");
    }

    final Object[] matches;
    try {
        matches = root.evaluateXPath(xpath.toString());
    } catch (XPatherException e) {
        throw interpreter.runtimeException("invalid xpath expression");
    }

    final ArrayValue value = new ArrayValue(new AggregateType(DataTypes.STRING_TYPE, matches.length));
    final SimpleXmlSerializer serializer = new SimpleXmlSerializer(cleaner.getProperties());
    int index = 0;
    for (Object match : matches) {
        Object rendered = match;
        if (match instanceof TagNode) {
            // Tag nodes are stored as their serialized markup.
            try {
                rendered = serializer.getXmlAsString((TagNode) match);
            } catch (IOException e) {
                StaticEntity.printStackTrace(e);
                throw interpreter.runtimeException("something went wrong while serializing to html");
            }
        }
        value.aset(new Value(index), new Value(rendered.toString()));
        index++;
    }
    return value;
}

Example 45

Project: ttr-master File: ArticleFragment.java View source code

/**
 * Looks up the alt text belonging to an image URL inside the article content. The content
 * is parsed with HtmlCleaner and walked with a {@code MyTagNodeVisitor} created for the
 * URL; the visitor's {@code alt} value is then converted to plain text.
 *
 * @param extra the image URL to look for in the content
 * @return the alt text, or {@code null} if the content is missing, does not contain the
 *         URL, or the visitor found no alt value
 */
private String getAltTextForImageUrl(String extra) {
    final boolean mentioned = content != null && content.contains(extra);
    if (!mentioned) {
        return null;
    }
    final TagNode root = new HtmlCleaner().clean(content);
    final MyTagNodeVisitor visitor = new MyTagNodeVisitor(extra);
    root.traverse(visitor);
    return visitor.alt == null ? null : Html.fromHtml(visitor.alt).toString();
}

Example 46

Project: ttrss-reader-fork-master File: ArticleFragment.java View source code

/**
 * Extracts the alt attribute associated with the given image URL from the article
 * content, using a small HTML parser and a visitor that walks the parsed tree.
 *
 * @param extra the image URL whose alt text is wanted
 * @return the alt text as plain text, or {@code null} when there is no content, the
 *         content does not mention the URL, or no alt value was recorded by the visitor
 */
private String getAltTextForImageUrl(String extra) {
    if (content == null) {
        return null;
    }
    if (!content.contains(extra)) {
        return null;
    }
    final MyTagNodeVisitor altFinder = new MyTagNodeVisitor(extra);
    final HtmlCleaner htmlCleaner = new HtmlCleaner();
    htmlCleaner.clean(content).traverse(altFinder);
    if (altFinder.alt != null) {
        return Html.fromHtml(altFinder.alt).toString();
    }
    return null;
}

Example 47

Project: TweetTopics2.0-master File: LinksUtils.java View source code

/**
 * Builds the preview metadata for a link found in a tweet.
 * <p>
 * The link is first expanded with {@code largeLink} and then matched, in order, against
 * the cache of known media URLs, the TweetTopics QR prefixes, plain image file
 * extensions and a list of image/video services. Services that need a remote lookup
 * (mytubo, instagr.am, plixi/lockerz, twitgoo, vimeo, youtube) perform blocking network
 * I/O. When a lookup for plixi/lockerz, twitgoo or vimeo fails, matching continues with
 * the remaining services. Links that match nothing are returned as a plain web link.
 *
 * @param link the link as it appears in the tweet
 * @return the metadata for the link; the original (unexpanded) link is always stored
 *         via {@code setOriginalLink}
 */
public static InfoLink getInfoTweet(String link) {
    String originalLink = link;
    // Expand shortened URLs before matching.
    link = largeLink(link);

    // Media URL already known to the cache.
    if (CacheData.getInstance().existURLMedia(link)) {
        Utils.URLContent content = CacheData.getInstance().getURLMedia(link);
        InfoLink il = new InfoLink();
        il.setService("Twitter Pic");
        il.setExtensiveInfo(true);
        il.setType(0);
        il.setLink(link);
        il.setOriginalLink(originalLink);
        il.setLinkImageThumb(content.linkMediaThumb);
        il.setLinkImageLarge(content.linkMediaLarge);
        return il;
    }
    // TweetTopics QR link.
    if (link.startsWith(Utils.URL_QR)) {
        InfoLink il = new InfoLink();
        il.setService("tweettopics-qr");
        il.setType(Utils.TYPE_LINK_TWEETOPICS_QR);
        il.setLink(link);
        il.setOriginalLink(originalLink);
        return il;
    }
    // TweetTopics shared-theme QR link.
    if (link.startsWith(Utils.URL_SHARE_THEME_QR)) {
        InfoLink il = new InfoLink();
        il.setService("tweettopics-theme");
        il.setType(Utils.TYPE_LINK_TWEETOPICS_THEME);
        il.setLink(link);
        il.setOriginalLink(originalLink);
        return il;
    }
    // Direct link to an image file.
    if (link.endsWith(".jpg") || link.endsWith(".png") || link.endsWith(".gif") || link.endsWith(".bmp")) {
        return buildImageInfoLink("Web", link, originalLink, link, link);
    }
    if (link.contains("imgur.com")) {
        String id = lastPathSegmentOf(link);
        return buildImageInfoLink("Imgur", link, originalLink, "http://i.imgur.com/" + id + "b.jpg", "http://i.imgur.com/" + id + ".jpg");
    }
    if (link.contains("lightbox")) {
        String id = lastPathSegmentOf(link);
        return buildImageInfoLink("Lightbox", link, originalLink, "http://lightbox.com/show/thumb/" + id, "http://lightbox.com/show/large/" + id);
    }
    if (link.contains("twitpic")) {
        String id = lastPathSegmentOf(link);
        return buildImageInfoLink("Twitpic", link, originalLink, "http://twitpic.com/show/mini/" + id, "http://twitpic.com/show/large/" + id);
    }
    if (link.contains("picplz")) {
        String id = lastPathSegmentOf(link);
        return buildImageInfoLink("Picplz", link, originalLink, "http://picplz.com/" + id + "/thumb/200", "http://picplz.com/" + id + "/thumb/400");
    }
    if (link.contains("img.ly")) {
        String id = lastPathSegmentOf(link);
        return buildImageInfoLink("Img.ly", link, originalLink, "http://img.ly/show/thumb/" + id, "http://img.ly/show/medium/" + id);
    }
    if (link.contains("vvcap")) {
        String image = link.replace(".htp", ".png");
        return buildImageInfoLink("Vvcap.net", link, originalLink, image, image);
    }
    if (link.contains("yfrog")) {
        String thumb = link + ".th.jpg";
        String large = link + ":android";
        Log.d(Utils.TAG, "yfrog (" + link + "): " + thumb + " -- " + large);
        return buildImageInfoLink("Yfrog", link, originalLink, thumb, large);
    }
    if (link.contains("twitvid")) {
        String image = "http://images2.twitvid.com/" + lastPathSegmentOf(link) + ".jpg";
        return buildVideoInfoLink("twitvid", link, originalLink, "Twitvid", 0, image, image);
    }
    if (link.contains("mytubo.net")) {
        String image = "";
        InputStreamReader isr = null;
        try {
            HtmlCleaner cleaner = newScrapingCleaner();
            isr = new InputStreamReader(new URL(link).openConnection().getInputStream());
            image = firstImageSrc(cleaner.clean(isr), "//img[@id='originPic']");
        } catch (Exception e) {
            // Best effort: fall back to an empty image URL.
            e.printStackTrace();
        } finally {
            closeSilently(isr);
        }
        return buildImageInfoLink("Mytubo.net", link, originalLink, image, image);
    }
    if (link.contains("instagr.am")) {
        String image = "";
        try {
            HtmlCleaner cleaner = newScrapingCleaner();
            cleaner.getProperties().setUseEmptyElementTags(true);
            image = firstImageSrc(cleaner.clean(getURIContent(link)), "//img[@class='photo']");
        } catch (Exception e) {
            // Best effort: fall back to an empty image URL.
            e.printStackTrace();
        }
        InfoLink il = buildImageInfoLink("Instagr.am", link, originalLink, image, image);
        Log.d(Utils.TAG, "Instagr.am (" + link + "): " + image);
        return il;
    }
    if (link.contains("plixi") || link.contains("lockerz")) {
        String strURL = "http://api.plixi.com/api/tpapi.svc/metadatafromurl?url=" + link;
        try {
            Document doc = fetchXmlDocument(strURL);
            if (doc != null) {
                try {
                    String imgThumb = firstTagText(doc, "ThumbnailUrl");
                    String imgLarge = firstTagText(doc, "MediumImageUrl");
                    if (!imgThumb.equals("")) {
                        // Unlike the other image services, this one does not flag extensive info.
                        InfoLink il = new InfoLink();
                        il.setService("Lockerz");
                        il.setType(0);
                        il.setLink(link);
                        il.setOriginalLink(originalLink);
                        il.setLinkImageThumb(imgThumb);
                        il.setLinkImageLarge(imgLarge);
                        return il;
                    }
                } catch (Exception e) {
                    Log.d(Utils.TAG, "Lockerz (" + link + "): unexpected metadata: " + e);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    if (link.contains("twitgoo")) {
        Document doc = fetchXmlDocument("http://twitgoo.com/api/message/info/" + lastPathSegmentOf(link));
        if (doc != null) {
            try {
                String imgThumb = firstTagText(doc, "thumburl");
                String imgLarge = firstTagText(doc, "imageurl");
                if (!imgThumb.equals("")) {
                    return buildImageInfoLink("Twitgoo", link, originalLink, imgThumb, imgLarge);
                }
            } catch (Exception e) {
                Log.d(Utils.TAG, "Twitgoo (" + link + "): unexpected metadata: " + e);
            }
        }
    }
    if (link.contains("vimeo")) {
        Document doc = fetchXmlDocument("http://vimeo.com/api/v2/video/" + lastPathSegmentOf(link) + ".xml");
        if (doc != null) {
            try {
                String imgThumb = firstTagText(doc, "thumbnail_small");
                String imgLarge = firstTagText(doc, "thumbnail_large");
                String title = firstTagText(doc, "title");
                int duration = Integer.parseInt(firstTagText(doc, "duration"));
                if (!imgThumb.equals("")) {
                    return buildVideoInfoLink("Vimeo", link, originalLink, title, duration, imgThumb, imgLarge);
                }
            } catch (Exception e) {
                Log.d(Utils.TAG, "Vimeo (" + link + "): unexpected metadata: " + e);
            }
        }
    }
    if (link.contains("youtube") || link.contains("youtu.be")) {
        String id = "";
        if (link.contains("youtube")) {
            // Video id is the value of the last "v=" query parameter.
            id = link.substring(link.lastIndexOf("v=") + 2);
            if (id.contains("&")) {
                id = id.substring(0, id.indexOf("&"));
            }
        }
        if (link.contains("youtu.be")) {
            // Short links carry the id as the last path segment.
            id = lastPathSegmentOf(link);
            if (id.contains("?")) {
                id = id.substring(0, id.indexOf("?"));
            }
        }
        String imgThumb = "http://img.youtube.com/vi/" + id + "/2.jpg";
        String imgLarge = "http://img.youtube.com/vi/" + id + "/0.jpg";
        Document doc = fetchXmlDocument("http://gdata.youtube.com/feeds/api/videos/" + id);
        // Defaults used when the feed cannot be loaded or parsed.
        String title = "Youtube";
        int duration = 0;
        try {
            if (doc != null) {
                title = firstTagText(doc, "title");
                duration = Integer.parseInt(doc.getElementsByTagName("yt:duration").item(0).getAttributes().getNamedItem("seconds").getNodeValue());
            }
        } catch (Exception e) {
            Log.d(Utils.TAG, "Youtube (" + link + "): unexpected metadata: " + e);
        }
        return buildVideoInfoLink("Youtube", link, originalLink, title, duration, imgThumb, imgLarge);
    }
    // Not a known media link: treat it as a plain web link.
    InfoLink il = new InfoLink();
    il.setService("web");
    il.setType(2);
    il.setLink(link);
    il.setOriginalLink(originalLink);
    il.setTitle(originalLink);
    return il;
}

/** Creates an image entry (type 0, extensive info) with the given thumbnail and large image URLs. */
private static InfoLink buildImageInfoLink(String service, String link, String originalLink, String thumb, String large) {
    InfoLink il = new InfoLink();
    il.setService(service);
    il.setExtensiveInfo(true);
    il.setType(0);
    il.setLink(link);
    il.setOriginalLink(originalLink);
    il.setLinkImageThumb(thumb);
    il.setLinkImageLarge(large);
    return il;
}

/** Creates a video entry (type 1, extensive info) with title, duration and preview image URLs. */
private static InfoLink buildVideoInfoLink(String service, String link, String originalLink, String title, int duration, String thumb, String large) {
    InfoLink il = new InfoLink();
    il.setService(service);
    il.setExtensiveInfo(true);
    il.setType(1);
    il.setLink(link);
    il.setOriginalLink(originalLink);
    il.setTitle(title);
    il.setDurationVideo(duration);
    il.setLinkImageThumb(thumb);
    il.setLinkImageLarge(large);
    return il;
}

/** Returns everything after the last {@code '/'} of the link (the whole link if there is none). */
private static String lastPathSegmentOf(String link) {
    return link.substring(link.lastIndexOf("/") + 1);
}

/** Creates an HtmlCleaner configured the way the page-scraping branches need it. */
private static HtmlCleaner newScrapingCleaner() {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    props.setAllowHtmlInsideAttributes(true);
    props.setAllowMultiWordAttributes(true);
    props.setRecognizeUnicodeChars(true);
    props.setOmitComments(true);
    return cleaner;
}

/** Returns the URL-decoded, trimmed {@code src} of the first node matching the XPath, or "" if nothing matches. */
private static String firstImageSrc(TagNode node, String xpath) throws XPatherException, IOException {
    Object[] matches = node.evaluateXPath(xpath);
    if (matches.length == 0) {
        return "";
    }
    TagNode img = (TagNode) matches[0];
    return URLDecoder.decode(img.getAttributeByName("src").toString().trim(), "UTF-8");
}

/**
 * Downloads and parses the XML document at the given URL, returning null when it cannot be loaded or parsed.
 * NOTE(review): the parser runs with default settings on remote content; consider
 * disabling DTDs/external entities if the platform's parser supports it.
 */
private static Document fetchXmlDocument(String strURL) {
    java.io.InputStream in = null;
    try {
        URLConnection urlConn = new URL(strURL).openConnection();
        DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        in = urlConn.getInputStream();
        return db.parse(in);
    } catch (IOException ioe) {
        Log.d(Utils.TAG, "Could not load XML from " + strURL + ": " + ioe);
    } catch (ParserConfigurationException pce) {
        Log.d(Utils.TAG, "Could not create XML parser for " + strURL + ": " + pce);
    } catch (SAXException se) {
        Log.d(Utils.TAG, "Could not parse XML from " + strURL + ": " + se);
    } finally {
        closeSilently(in);
    }
    return null;
}

/** Returns the text of the first child of the first element with the given tag name. */
private static String firstTagText(Document doc, String tagName) {
    return doc.getElementsByTagName(tagName).item(0).getFirstChild().getNodeValue();
}

/** Closes the resource if it is non-null, ignoring any failure to close. */
private static void closeSilently(java.io.Closeable resource) {
    if (resource == null) {
        return;
    }
    try {
        resource.close();
    } catch (IOException ignored) {
        // Nothing useful can be done if closing fails.
    }
}

Example 48

Project: BotLibre-master File: Http.java View source code

/**
 * Converts an HTML input stream into XHTML that a DOM parser can read.
 * The stream is read as UTF-8, cleaned with the shared HtmlCleaner and
 * serialized with a {@link SimpleXmlSerializer}.
 *
 * @param input the HTML to convert
 * @return a reader over the serialized XHTML
 * @throws IOException if reading or serializing fails
 */
public StringReader convertToXHTML(InputStream input) throws IOException {
    final TagNode root = getHtmlCleaner().clean(input, "UTF-8");
    final StringWriter buffer = new StringWriter();
    root.serialize(new SimpleXmlSerializer(getHtmlCleaner().getProperties()), buffer);
    buffer.flush();
    return new StringReader(buffer.toString());
}

Example 49

Project: concourse-connect-master File: HTMLToWikiUtils.java View source code

/**
 * Converts an HTML fragment into wiki markup.
 * <p>
 * The HTML is cleaned with HtmlCleaner, turned into a DOM and its top-level nodes are
 * handed to {@code processChildNodes}, which appends the wiki output to a buffer.
 *
 * @param html        the HTML to convert
 * @param contextPath passed through to {@code processChildNodes}
 * @param projectId   passed through to {@code processChildNodes}
 * @return the trimmed wiki text terminated by {@code CRLF}, or an empty string when
 *         nothing was produced
 * @throws Exception if the DOM cannot be created or node processing fails
 */
public static String htmlToWiki(String html, String contextPath, int projectId) throws Exception {
    // Strip the nbsp (entity and literal character) because it gets converted to unicode
    html = StringUtils.replace(html, "&nbsp;", " ");
    html = StringUtils.replace(html, "\u00A0", " ");
    // Take the html create DOM for parsing
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    TagNode node = cleaner.clean(html);
    Document document = new DomSerializer(props, true).createDOM(node);
    if (LOG.isTraceEnabled()) {
        LOG.trace(html);
    }
    // Process each node and output the wiki equivalent
    StringBuffer sb = new StringBuffer();
    ArrayList<Node> nodeList = new ArrayList<Node>();
    for (int i = 0; i < document.getChildNodes().getLength(); i++) {
        Node n = document.getChildNodes().item(i);
        nodeList.add(n);
    }
    processChildNodes(nodeList, sb, 0, true, true, false, "", contextPath, projectId);
    if (sb.length() > 0) {
        String content = sb.toString().trim();
        if (content.contains("&apos;")) {
            // The XML escaping introduces &apos;, which wiki text should show as a plain apostrophe
            content = StringUtils.replace(content, "&apos;", "'");
        }
        if (!content.endsWith(CRLF)) {
            return content + CRLF;
        } else {
            return content;
        }
    } else {
        return "";
    }
}

Example 50

Project: iaf-master File: XmlUtils.java View source code

/**
 * Converts an HTML string to XHTML.
 * The input is trimmed and passed through {@code skipDocTypeDeclaration}; only when the
 * result starts with an {@code <html>} tag is it cleaned and re-serialized as XML.
 *
 * @param htmlString the HTML to convert, may be empty or {@code null}
 * @return {@code null} for empty input, otherwise the processed string
 */
public static String toXhtml(String htmlString) {
    if (!StringUtils.isNotEmpty(htmlString)) {
        return null;
    }
    final String withoutDocType = XmlUtils.skipDocTypeDeclaration(htmlString.trim());
    final boolean startsWithHtmlTag = withoutDocType.startsWith("<html>") || withoutDocType.startsWith("<html ");
    if (!startsWithHtmlTag) {
        // Not a full HTML document: return it as is.
        return withoutDocType;
    }
    final CleanerProperties cleanerProps = new CleanerProperties();
    final TagNode root = new HtmlCleaner(cleanerProps).clean(withoutDocType);
    return new SimpleXmlSerializer(cleanerProps).getXmlAsString(root);
}

Example 51

Project: cos598b-master File: MessageCompose.java View source code

/**
 * Build and populate the UI with the quoted message.
 *
 * <p>Resolves {@code MessageFormat.AUTO} to HTML or TEXT, optionally strips the
 * dash signature from the original body when replying, fills the quoted HTML
 * and/or quoted text widgets, and finally shows or hides the quoted text.
 *
 * @param showQuotedText
 *         {@code true} if the quoted text should be shown, {@code false} otherwise.
 *
 * @throws MessagingException
 *         declared by this method; presumably propagated from the message-access
 *         helpers called here (not visible in this block).
 */
private void populateUIWithQuotedMessage(boolean showQuotedText) throws MessagingException {
    // AUTO resolves to HTML when the source message has a text/html part, TEXT otherwise.
    if (mMessageFormat == MessageFormat.AUTO) {
        mMessageFormat = MimeUtility.findFirstPartByMimeType(mSourceMessage, "text/html") == null ? MessageFormat.TEXT : MessageFormat.HTML;
    }
    // TODO -- I am assuming that mSourceMessageBody will always be a text part.  Is this a safe assumption?
    // Handle the original message in the reply
    // If we already have mSourceMessageBody, use that.  It's pre-populated if we've got crypto going on.
    String content = mSourceMessageBody != null ? mSourceMessageBody : getBodyTextFromMessage(mSourceMessage, mMessageFormat);
    if (mMessageFormat == MessageFormat.HTML) {
        // Strip the signature only for reply / reply-all and only when the account asks for it.
        // Stripping truncates the HTML at the signature, so trailing
        // closing tags such as </div>, </span>, </table>, </pre> will be cut off.
        // The HtmlCleaner pass further down repairs that.
        if (mAccount.isStripSignature() && (ACTION_REPLY_ALL.equals(getIntent().getAction()) || ACTION_REPLY.equals(getIntent().getAction()))) {
            Matcher dashSignatureHtml = DASH_SIGNATURE_HTML.matcher(content);
            if (dashSignatureHtml.find()) {
                // Collect the offsets of every <blockquote> opening and closing tag.
                Matcher blockquoteStart = BLOCKQUOTE_START.matcher(content);
                Matcher blockquoteEnd = BLOCKQUOTE_END.matcher(content);
                List<Integer> start = new ArrayList<Integer>();
                List<Integer> end = new ArrayList<Integer>();
                while (blockquoteStart.find()) {
                    start.add(blockquoteStart.start());
                }
                while (blockquoteEnd.find()) {
                    end.add(blockquoteEnd.start());
                }
                if (start.size() != end.size()) {
                    // Unbalanced tags: the offsets cannot be paired up, so leave the content untouched.
                    Log.d(K9.LOG_TAG, "There are " + start.size() + " <blockquote> tags, but " + end.size() + " </blockquote> tags. Refusing to strip.");
                } else if (start.size() > 0) {
                    // Ignore quoted signatures in blockquotes.
                    // Search order: before the first <blockquote>, then between consecutive
                    // blockquotes, then after the last </blockquote>.
                    dashSignatureHtml.region(0, start.get(0));
                    if (dashSignatureHtml.find()) {
                        // before first <blockquote>.
                        content = content.substring(0, dashSignatureHtml.start());
                    } else {
                        for (int i = 0; i < start.size() - 1; i++) {
                            // within blockquotes.
                            // Only the gap between closing tag i and opening tag i + 1 is searched.
                            if (end.get(i) < start.get(i + 1)) {
                                dashSignatureHtml.region(end.get(i), start.get(i + 1));
                                if (dashSignatureHtml.find()) {
                                    content = content.substring(0, dashSignatureHtml.start());
                                    break;
                                }
                            }
                        }
                        // NOTE(review): content may already have been truncated by the loop above,
                        // while dashSignatureHtml still matches against the string it was created on.
                        // After a truncation the last closing tag lies beyond the new length, so this
                        // check looks like it is skipped in that case -- confirm.
                        if (end.get(end.size() - 1) < content.length()) {
                            // after last </blockquote>.
                            dashSignatureHtml.region(end.get(end.size() - 1), content.length());
                            if (dashSignatureHtml.find()) {
                                content = content.substring(0, dashSignatureHtml.start());
                            }
                        }
                    }
                } else {
                    // No blockquotes found.
                    // dashSignatureHtml still holds the match from the find() above.
                    content = content.substring(0, dashSignatureHtml.start());
                }
            }
            // Fix the stripping off of closing tags if a signature was stripped,
            // as well as clean up the HTML of the quoted message.
            // This runs whenever signature stripping is enabled for this action,
            // whether or not a signature was actually found.
            HtmlCleaner cleaner = new HtmlCleaner();
            CleanerProperties properties = cleaner.getProperties();
            // see http://htmlcleaner.sourceforge.net/parameters.php for descriptions
            properties.setNamespacesAware(false);
            properties.setAdvancedXmlEscape(false);
            properties.setOmitXmlDeclaration(true);
            properties.setOmitDoctypeDeclaration(false);
            properties.setTranslateSpecialEntities(false);
            properties.setRecognizeUnicodeChars(false);
            TagNode node = cleaner.clean(content);
            SimpleHtmlSerializer htmlSerialized = new SimpleHtmlSerializer(properties);
            try {
                content = htmlSerialized.getAsString(node, "UTF8");
            } catch (java.io.IOException ioe) {
                // Best effort: on failure the uncleaned content is kept and the error is logged.
                Log.e(K9.LOG_TAG, "Problem cleaning quoted message.", ioe);
            }
        }
        // Add the HTML reply header to the top of the content.
        mQuotedHtmlContent = quoteOriginalHtmlMessage(mSourceMessage, content, mQuoteStyle);
        // Load the message with the reply header.
        mQuotedHTML.setText(mQuotedHtmlContent.getQuotedContent(), "text/html");
        // The plain-text quote is populated as well, from the TEXT body of the source message.
        mQuotedText.setText(quoteOriginalTextMessage(mSourceMessage, getBodyTextFromMessage(mSourceMessage, MessageFormat.TEXT), mQuoteStyle));
    } else if (mMessageFormat == MessageFormat.TEXT) {
        // Same reply / reply-all condition as the HTML branch; the first plain-text
        // signature match is replaced by a single CRLF.
        if (mAccount.isStripSignature() && (ACTION_REPLY_ALL.equals(getIntent().getAction()) || ACTION_REPLY.equals(getIntent().getAction()))) {
            if (DASH_SIGNATURE_PLAIN.matcher(content).find()) {
                content = DASH_SIGNATURE_PLAIN.matcher(content).replaceFirst("\r\n");
            }
        }
        mQuotedText.setText(quoteOriginalTextMessage(mSourceMessage, content, mQuoteStyle));
    }
    if (showQuotedText) {
        showOrHideQuotedText(QuotedTextMode.SHOW);
    } else {
        showOrHideQuotedText(QuotedTextMode.HIDE);
    }
}

Example 52

Project: xMail-master File: HtmlSanitizer.java View source code

/**
 * Cleans the given HTML, applies {@code removeMetaRefresh} to the resulting
 * tree, and serializes the tree back to a string.
 *
 * @param html the markup to sanitize
 * @return the serialized form of the cleaned tree
 */
public static String sanitize(String html) {
    final TagNode cleanedRoot = HTML_CLEANER.clean(html);
    // Runs on the parsed tree before serialization.
    removeMetaRefresh(cleanedRoot);
    final String serialized = HTML_SERIALIZER.getAsString(cleanedRoot, "UTF8");
    return serialized;
}

Example 53

Project: XDA-One-master File: XDATagHandlers.java View source code

/**
 * Handles an image-like tag by appending a one-character placeholder to the
 * builder and pushing an {@code ImageSpan} for the tag's {@code src}
 * attribute over that placeholder.
 *
 * <p>The incoming {@code start} and {@code end} offsets are not used; the span
 * range is computed from the builder length before and after the append.
 *
 * @param node    the tag being handled; its {@code src} attribute is read
 * @param builder the text being built; receives the placeholder character
 * @param start   unused
 * @param end     unused
 * @param stack   receives the span covering the placeholder
 */
@Override
public void handleTagNode(final TagNode node, final SpannableStringBuilder builder, final int start, final int end, final SpanStack stack) {
    final String src = node.getAttributeByName("src");
    final int newStart = builder.length();
    // Append U+FFFC (OBJECT REPLACEMENT CHARACTER) so the span covers exactly one
    // character. Appending an empty string would leave newStart == builder.length()
    // and push the span over an empty range.
    builder.append("\uFFFC");
    stack.pushSpan(new ImageSpan(src), newStart, builder.length());
}

Example 54

Project: commcare-master File: MarkupUtil.java View source code

/**
 * Pushes an {@code UnderlineSpan} covering the tag's range
 * {@code [start, end)} onto the span stack. The node and builder are not read.
 */
@Override
public void handleTagNode(TagNode node, SpannableStringBuilder builder, int start, int end, SpanStack spanStack) {
    final UnderlineSpan underline = new UnderlineSpan();
    spanStack.pushSpan(underline, start, end);
}

Example 55

Project: WotCrawler-master File: Parser.java View source code

// -------------------- html parsing --------------------
/**
 * Parses an HTML file into an {@code org.w3c.dom.Document}.
 *
 * <p>The file is cleaned with HtmlCleaner, the resulting tree is serialized
 * to an XML string with {@code SimpleXmlSerializer}, and that string is
 * handed to {@code buildDOM}.
 *
 * @param file the HTML file to parse
 * @return the document produced by {@code buildDOM} from the cleaned markup
 * @throws IOException cannot access the file
 * @throws ParserConfigurationException parser configuration invalid
 * @throws SAXException error while parsing (usually invalid XML)
 */
public static Document parseHTML(File file) throws IOException, ParserConfigurationException, SAXException {
    final HtmlCleaner htmlCleaner = new HtmlCleaner();
    final TagNode root = htmlCleaner.clean(file);
    // Serialize with the cleaner's own properties.
    final SimpleXmlSerializer xmlSerializer = new SimpleXmlSerializer(htmlCleaner.getProperties());
    final String cleanedMarkup = xmlSerializer.getAsString(root);
    return buildDOM(cleanedMarkup);
}