HtmlToTextConverter.java example

Explorer
mulgara-master
- src
  - jar
  - war
    - server-http
      - java
        HttpServer.java
        HttpServerServlet.java
- tools
  - src
    - org
      - mulgara
        tools
        Sparql.java
        Tql.java
/*
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is the Kowari Metadata Store.
 *
 * The Initial Developer of the Original Code is Plugged In Software Pty
 * Ltd (http://www.pisoftware.com, mailto:info@pisoftware.com). Portions
 * created by Plugged In Software Pty Ltd are Copyright (C) 2001,2002
 * Plugged In Software Pty Ltd. All Rights Reserved.
 *
 * Contributor(s): N/A.
 *
 * [NOTE: The text of this Exhibit A may differ slightly from the text
 * of the notices in the Source Code files of the Original Code. You
 * should use the text of this Exhibit A rather than the text found in the
 * Original Code Source Code for Your Modifications.]
 *
 */

package org.mulgara.util.conversion.html;

// Java 2 standard packages
import java.io.*;
import java.util.*;

import org.apache.log4j.Logger;

/**
 * Converts HTML text into unicode text. <p>
 *
 * The method {@link #convert(String)} can be used to replace non-breaking
 * spaces with normal spaces if the unicode value is not acceptable. </p> <p>
 *
 * The end of line characters can be set using the System property <code>line.separator</code>
 * . The tags for putting end of lines in at can be set using {@link
 * #getEndlineSet()} and {@link #setEndlineSet(Set)}. </p> <p>
 *
 * Whether or not titles and image alt tags are included in the output can be
 * set using {@link #setIncludeTitle(boolean)} and {@link
 * #setIncludeImageAlts(boolean)}. </p>
 *
 * @created 2002-08-01
 *
 * @author Ben Warren
 *
 * @version $Revision: 1.9 $
 *
 * @modified $Date: 2005/01/05 04:59:30 $
 *
 * @maintenanceAuthor $Author: newmana $
 *
 * @company <A href="mailto:info@PIsoftware.com">Plugged In Software</A>
 *
 * @copyright ©2002 <a href="http://www.pisoftware.com/">Plugged In
 *      Software Pty Ltd</a>
 *
 * @licence <a href="{@docRoot}/../../LICENCE">Mozilla Public License v1.1</a>
 */
public class HtmlToTextConverter {
  
  private static final Logger logger = Logger.getLogger(HtmlToTextConverter.class);

  /**
   * Private constructor to stop instanciation.
   */
  private HtmlToTextConverter() {

  }

  /**
   * Set whether or not normal spaces will replace non-breaking space entities.
   *
   * @param value If true normal spaces will replace non-braking spaces
   *      otherwise non-breaking spaces will be used.
   */
  public static void setUseNormalSpace(boolean value) {

    // Normal space
    if (value) {

      Entities.add(" ", 32);
    }

    // Non-breaking
    else {

      Entities.add(" ", 160);
    }
  }

  /**
   * Set the set of tags to put an end of line in for. The tag text excludes the
   * closing angle bracket. eg <br or </h1
   *
   * @param set The end of line tag set.
   */
  public static void setEndlineSet(Set<String> set) {

    HTMLParser.setEndlineSet(set);
  }

  /**
   * Set if the title should be included in the text output.
   *
   * @param include The title will be included if true. Defaults to true.
   */
  public static void setIncludeTitle(boolean include) {

    HTMLParser.setIncludeTitle(include);
  }

  /**
   * Set if image alt tags should be included in the text output.
   *
   * @param include The image alt tags will be included if true. Defaults to
   *      true.
   */
  public static void setIncludeImageAlts(boolean include) {

    HTMLParser.setIncludeImageAlts(include);
  }

  /**
   * Get the set of tags to put an end of line in for. The tag text excludes the
   * closing angle bracket. eg <br or </h1
   *
   * @return The end of line tag set.
   */
  public static Set<String> getEndlineSet() {

    return HTMLParser.getEndlineSet();
  }

  /**
   * Convert a string of HTML to a text string.
   *
   * @param html The string of HTML.
   * @return The converted string or null if <code>html</code> was null.
   * @throws ParseException If there is a problem parsing the HTML.
   * @throws IOException If an IO error occurs.
   */
  public static String convert(String html) throws ParseException, IOException {

    String text = null;

    if (html != null) {

      HTMLParser parser = new HTMLParser(new StringReader(html));
      StringWriter writer = new StringWriter();

      // Create the text string
      Reader reader = parser.getReader();

      try {
        char[] buffer = new char[1024];
        int numRead = reader.read(buffer);

        while (numRead != -1) {

          writer.write(buffer, 0, numRead);
          numRead = reader.read(buffer);
        }
      } finally {
        try {
          reader.close();
        } catch (IOException e) {
          logger.warn("Error closing reader", e);
        }
      }

      writer.flush();
      text = writer.toString();
    }

    return text;
  }

  /**
   * Convert a reader with a HTML stream into a reader with a text stream.
   *
   * @param html The HTML reader.
   * @return A reader that reads the text conversion stream or null if <code>html</code>
   *      was null.
   * @throws ParseException If there is a problem parsing the HTML.
   * @throws IOException If an IO error occurs.
   */
  public static Reader convert(Reader html) throws ParseException, IOException {

    if (html != null) {

      HTMLParser parser = new HTMLParser(html);

      return parser.getReader();
    }
    else {

      return null;
    }
  }

  /**
   * Runs this converter over a whole directory of HTML files or a single HTML
   * file and prints the text conversion to standard out. <p>
   *
   * The args are either:
   * <ol>
   *   <li> -dir directory_name (convert entire directory) or</li>
   *   <li> file_name (convert a single file)</li>
   * </ol>
   * </p>
   *
   * @param args The command line args.
   * @throws Exception on error.
   */
  public static void main(String[] args) throws Exception {

    // Directory
    if ("-dir".equals(args[0])) {

      String[] files = new File(args[1]).list();
      java.util.Arrays.sort(files);

      for (int i = 0; i < files.length; i++) {

        System.err.println(files[i]);

        File file = new File(args[1], files[i]);
        parse(file);
      }
    }

    // One file
    else {

      parse(new File(args[0]));
    }
  }

  /**
   * Convert a HTML file to text and write it to standard out.
   *
   * @param file The file to convert.
   * @throws Exception on error.
   */
  private static void parse(File file) throws Exception {

    HTMLParser parser = new HTMLParser(file);
    BufferedReader reader = new BufferedReader(parser.getReader());

    try {
      for (String l = reader.readLine(); l != null; l = reader.readLine()) {

        System.out.println(l);
      }
    } finally {
      try {
        reader.close();
      } catch (IOException e) {
        System.err.println("Error closing reader");
        e.printStackTrace(System.err);
      }
    }
  }
}