XMLUtils.java example

Explorer

meaningfulweb-master
- meaningfulweb-app
  - src
    - main
      - java
        org
        meaningfulweb
        servlet
        FileFactoryBean.java
        HtmlExtractorController.java
        MeaningfulWebServlet.java
- meaningfulweb-core
  - src
    - main
      - java
        org
        meaningfulweb
        api
        MeaningfulWebObject.java
        MetaContentExtractor.java
        StressTest.java
        cext
        Extract.java
        ExtractForm.java
        ExtractUtils.java
        HtmlContentPipeline.java
        HtmlContentProcessor.java
        HtmlContentProcessorFactory.java
        HtmlExtractor.java
        processors
        ArticleProcessor.java
        BestImageProcessor.java
        BoilerpipeArticleProcessor.java
        DomainSpecifiedImageProcessor.java
        ElementProcessor.java
        FullContentProcessor.java
        HyperlinkProcessor.java
        ImageProcessor.java
        MainContentProcessor.java
        MeaningfulwebCompositeProcessor.java
        OpengraphContentProcessor.java
        ParagraphProcessor.java
        RegexProcessor.java
        ScriptProcessor.java
        SystemCommandProcessor.java
        TwitpicExtractionHandler.java
        XPathCleanerProcessor.java
        XPathProcessor.java
        detector
        DetectorFactory.java
        imgext
        ExtractedContents.java
        ImageFetcher.java
        ImageFilter.java
        ImageHeader.java
        ImageInfo.java
        ImageMeta.java
        ImageProp.java
        ImageSelector.java
        ImageSizeExtractor.java
        util
        EncodingUtils.java
        HTMLOutputter.java
        HtmlExtractUtils.java
        ImageUtil.java
        JDomUtils.java
        JsonUtils.java
        ProcessResponse.java
        ProcessUtils.java
        SystemCommand.java
        TempDirUtils.java
        URIUtils.java
        URLUtil.java
        XMLUtils.java
        domain
        DomainSuffix.java
        DomainSuffixes.java
        DomainSuffixesReader.java
        TopLevelDomain.java
        http
        HttpClientFactory.java
        HttpClientService.java
        HttpComponentsServiceImpl.java
        HttpException.java
        security
        AuthenticationService.java
        ReloadableFileAuthenticationServiceImpl.java
    - test
      - java
        org
        meaningfulweb
        core
        test
        MWCoreTest.java
- meaningfulweb-opengraph
  - src
    - main
      - java
        org
        meaningfulweb
        opengraph
        OGObject.java
        OpenGraphContentHandler.java
        OpenGraphParser.java
        OpenGraphVocabulary.java
    - test
      - java
        org
        meaningfulweb
        opengraph
        test
        Og4jTestCase.java
        Og4jTestSuite.java

package org.meaningfulweb.util;

import java.io.IOException;
import java.io.StringWriter;
import java.util.List;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.jdom.Comment;
import org.jdom.Content;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.Text;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;

public class XMLUtils {

  /**
   * Recursively collects the text found beneath a node into a builder.
   * Elements are descended into, text nodes contribute their normalized and
   * XML-unescaped value followed by a single space, and every other kind of
   * node (comments included) contributes nothing.
   * 
   * @param node The node to collect text from.
   * @param builder The builder that receives the collected text.
   */
  private static void getText(Content node, StringBuilder builder) {

    if (node instanceof Text) {
      String normalized = ((Text)node).getTextNormalize() + " ";
      String trimmed = StringUtils.trim(normalized);
      if (StringUtils.isBlank(trimmed)) {
        // whitespace-only text nodes are skipped entirely
        return;
      }
      builder.append(StringEscapeUtils.unescapeXml(trimmed)).append(" ");
      return;
    }

    if (!(node instanceof Element)) {
      // comments and any other node types carry no extractable text
      return;
    }

    List<Content> kids = ((Element)node).getContent();
    if (kids == null || kids.isEmpty()) {
      return;
    }
    for (Content kid : kids) {
      getText(kid, builder);
    }
  }

  /**
   * Replaces every character above the Latin-1 range (code point greater than
   * 255) with an HTML numeric character reference of the form
   * <code>&amp;#NNNN;</code>. Characters with a code point of 255 or lower,
   * including the non-ascii range 128-255, are copied through unchanged.
   * 
   * The input is walked by Unicode code point rather than by UTF-16 code unit,
   * so a supplementary character (stored as a surrogate pair) is written as
   * one reference to the real code point instead of two references to the
   * individual surrogates, which would not be a valid character reference.
   * 
   * @param notAscii The string to change. May be null.
   * 
   * @return The converted string, or an empty string if the input is null.
   */
  public static String toAscii(String notAscii) {

    if (notAscii == null) {
      return "";
    }

    StringBuilder builder = new StringBuilder(notAscii.length());
    int i = 0;
    while (i < notAscii.length()) {
      int codePoint = notAscii.codePointAt(i);
      if (codePoint > 255) {
        builder.append("&#").append(codePoint).append(';');
      }
      else {
        // anything at or below 255 always fits in a single char
        builder.append((char)codePoint);
      }
      // advance by 2 when a surrogate pair was consumed, otherwise by 1
      i += Character.charCount(codePoint);
    }
    return builder.toString();
  }

  /**
   * This method ensures that the output String has only valid XML unicode
   * characters as specified by the XML 1.0 standard. For reference, please see
   * <a href="http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char">the
   * standard</a>. This method will return an empty String if the input is null
   * or empty.
   * 
   * The input is examined by Unicode code point, not by UTF-16 code unit, so
   * that supplementary characters (0x10000 to 0x10FFFF, stored as a surrogate
   * pair) are recognized as valid and kept. Unpaired surrogates are removed.
   * 
   * @param in The String whose non-valid characters we want to remove.
   * @return The in String, stripped of non-valid characters.
   */
  public static String stripNonValidXMLCharacters(String in) {

    if (in == null || in.length() == 0) {
      return ""; // vacancy test.
    }

    StringBuilder out = new StringBuilder(in.length());
    int i = 0;
    while (i < in.length()) {
      int codePoint = in.codePointAt(i);
      boolean valid = (codePoint == 0x9) || (codePoint == 0xA)
        || (codePoint == 0xD)
        || ((codePoint >= 0x20) && (codePoint <= 0xD7FF))
        || ((codePoint >= 0xE000) && (codePoint <= 0xFFFD))
        || ((codePoint >= 0x10000) && (codePoint <= 0x10FFFF));
      if (valid) {
        out.appendCodePoint(codePoint);
      }
      // advance by 2 when a surrogate pair was consumed, otherwise by 1
      i += Character.charCount(codePoint);
    }
    return out.toString();
  }

  /**
   * Serializes a document to a string using the "UTF-8" encoding.
   * 
   * @param doc The document to serialize.
   * @return The result of {@link #toXml(Document, String)} for "UTF-8".
   */
  public static String toXml(Document doc) {
    String encoding = "UTF-8";
    return toXml(doc, encoding);
  }

  /**
   * Serializes a document to a pretty printed string, with empty elements
   * expanded and the xml declaration omitted, and then unescapes the XML and
   * HTML entities in the serialized output.
   * 
   * @param doc The document to serialize.
   * @param encoding The encoding name set on the output format.
   * @return The serialized and unescaped string, or null if an IOException
   * occurred while writing.
   */
  public static String toXml(Document doc, String encoding) {

    Format prettyFormat = Format.getPrettyFormat();
    prettyFormat.setExpandEmptyElements(true);
    prettyFormat.setOmitDeclaration(true);
    prettyFormat.setEncoding(encoding);

    String result = null;
    try {
      // write out the xml to an in-memory buffer
      StringWriter buffer = new StringWriter();
      new XMLOutputter(prettyFormat).output(doc, buffer);

      // the outputter escapes characters that are legal in html; entity
      // references in the serialized output are decoded back into the
      // characters they stand for, first the xml ones and then the html ones
      // NOTE(review): once decoded, the result is not guaranteed to be
      // well-formed xml -- confirm callers only use it for display
      result = StringEscapeUtils.unescapeHtml(
        StringEscapeUtils.unescapeXml(buffer.toString()));
      buffer.close();
    }
    catch (IOException ignored) {
      // not expected when writing to a StringWriter; null is returned
    }

    return result;
  }

  /**
   * Converts a document to HTML using the "UTF-8" encoding.
   * 
   * @param doc The document to convert.
   * @return The result of {@link #toHtml(Document, String)} for "UTF-8".
   */
  public static String toHtml(Document doc) {
    String encoding = "UTF-8";
    return toHtml(doc, encoding);
  }

  /**
   * Converts an XML Document object to HTML. The document is pretty printed
   * through the HTMLOutputter, with empty elements expanded and the xml
   * declaration omitted, and the XML and HTML entities in the serialized
   * output are then unescaped.
   * 
   * @param doc The document to convert.
   * @param encoding The encoding name set on the output format.
   * @return The serialized and unescaped string, or null if an IOException
   * occurred while writing.
   */
  public static String toHtml(Document doc, String encoding) {

    Format prettyFormat = Format.getPrettyFormat();
    prettyFormat.setExpandEmptyElements(true);
    prettyFormat.setOmitDeclaration(true);
    prettyFormat.setEncoding(encoding);

    String result = null;
    try {
      // write out the document without the xml declaration
      // NOTE(review): the HTMLOutputter presumably adds the html doctype --
      // confirm against that class
      StringWriter buffer = new StringWriter();
      new HTMLOutputter(prettyFormat).output(doc, buffer);

      // the outputter escapes characters that are legal in html; entity
      // references in the serialized output are decoded back into the
      // characters they stand for, first the xml ones and then the html ones
      result = StringEscapeUtils.unescapeHtml(
        StringEscapeUtils.unescapeXml(buffer.toString()));
      buffer.close();
    }
    catch (IOException ignored) {
      // not expected when writing to a StringWriter; null is returned
    }

    return result;
  }

  /**
   * Converts an element to HTML using the "UTF-8" encoding.
   * 
   * @param elem The element to convert.
   * @return The result of {@link #toHtml(Element, String)} for "UTF-8".
   */
  public static String toHtml(Element elem) {
    String encoding = "UTF-8";
    return toHtml(elem, encoding);
  }

  /**
   * Serializes a single element to a pretty printed string, with empty
   * elements expanded and the xml declaration omitted, and then unescapes the
   * XML and HTML entities in the serialized output.
   * 
   * @param elem The element to serialize.
   * @param encoding The encoding name set on the output format.
   * @return The serialized and unescaped string, or null if an IOException
   * occurred while writing.
   */
  public static String toHtml(Element elem, String encoding) {

    Format prettyFormat = Format.getPrettyFormat();
    prettyFormat.setExpandEmptyElements(true);
    prettyFormat.setOmitDeclaration(true);
    prettyFormat.setEncoding(encoding);

    String result = null;
    try {
      // write out the element to an in-memory buffer
      StringWriter buffer = new StringWriter();
      new XMLOutputter(prettyFormat).output(elem, buffer);

      // the outputter escapes characters that are legal in html; entity
      // references in the serialized output are decoded back into the
      // characters they stand for, first the xml ones and then the html ones
      result = StringEscapeUtils.unescapeHtml(
        StringEscapeUtils.unescapeXml(buffer.toString()));
      buffer.close();
    }
    catch (IOException ignored) {
      // not expected when writing to a StringWriter; null is returned
    }

    return result;
  }

  /**
   * Converts an XML Document object to text by extracting the text of its
   * root element.
   * 
   * @param doc The document to convert.
   * @return The result of {@link #toText(Element)} for the root element.
   */
  public static String toText(Document doc) {
    return toText(doc.getRootElement());
  }

  /**
   * Converts an XML Element object to text. The text nodes beneath the
   * element are collected and the result is passed through the
   * HtmlExtractUtils newline, tag and contiguous whitespace removal, then XML
   * and HTML unescaped and trimmed.
   * 
   * @param elem The element to convert.
   * @return The cleaned up text of the element.
   */
  public static String toText(Element elem) {

    // gather only the text nodes found beneath the element
    StringBuilder collected = new StringBuilder();
    List<Content> children = elem.getContent();
    for (Content child : children) {
      getText(child, collected);
    }

    // strip layout and markup leftovers from the collected text
    String cleaned = HtmlExtractUtils.removeNewlines(collected.toString());
    cleaned = HtmlExtractUtils.removeTags(cleaned);
    cleaned = HtmlExtractUtils.removeContiguousWhitespace(cleaned);

    // decode xml entities first and html entities second, then trim
    String decoded = StringEscapeUtils.unescapeHtml(
      StringEscapeUtils.unescapeXml(cleaned));
    return StringUtils.trim(decoded);
  }
}