/*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is the Kowari Metadata Store.
*
* The Initial Developer of the Original Code is Plugged In Software Pty
* Ltd (http://www.pisoftware.com, mailto:info@pisoftware.com). Portions
* created by Plugged In Software Pty Ltd are Copyright (C) 2001,2002
* Plugged In Software Pty Ltd. All Rights Reserved.
*
* Contributor(s): N/A.
*
* [NOTE: The text of this Exhibit A may differ slightly from the text
* of the notices in the Source Code files of the Original Code. You
* should use the text of this Exhibit A rather than the text found in the
* Original Code Source Code for Your Modifications.]
*
*/
package org.mulgara.util.conversion.html;
// Java 2 standard packages
import java.io.*;
import java.util.*;
import org.apache.log4j.Logger;
/**
* Converts HTML text into unicode text. <p>
*
* The method {@link #setUseNormalSpace(boolean)} can be used to replace
* non-breaking spaces with normal spaces if the unicode value is not
* acceptable. </p> <p>
*
* The end of line characters can be set using the System property
* <code>line.separator</code>. The tags at which end of lines are inserted
* can be read and set using {@link #getEndlineSet()} and {@link
* #setEndlineSet(Set)}. </p> <p>
*
* Whether or not titles and image alt tags are included in the output can be
* set using {@link #setIncludeTitle(boolean)} and {@link
* #setIncludeImageAlts(boolean)}. </p>
*
* @created 2002-08-01
*
* @author Ben Warren
*
* @version $Revision: 1.9 $
*
* @modified $Date: 2005/01/05 04:59:30 $
*
* @maintenanceAuthor $Author: newmana $
*
* @company <A href="mailto:info@PIsoftware.com">Plugged In Software</A>
*
* @copyright ©2002 <a href="http://www.pisoftware.com/">Plugged In
* Software Pty Ltd</a>
*
* @licence <a href="{@docRoot}/../../LICENCE">Mozilla Public License v1.1</a>
*/
public class HtmlToTextConverter {

  /** Logger used to report non-fatal problems, such as a failure to close a reader. */
  private static final Logger logger = Logger.getLogger(HtmlToTextConverter.class);

  /** Usage text reported when {@link #main(String[])} is given unusable arguments. */
  private static final String USAGE =
      "Usage: HtmlToTextConverter (-dir directory_name | file_name)";

  /**
   * Private constructor to stop instantiation; every member of this class is
   * static.
   */
  private HtmlToTextConverter() {
  }

  /**
   * Set whether or not normal spaces will replace non-breaking space entities.
   * The mapping is registered through the static {@code Entities.add} call.
   *
   * @param value If true normal spaces (character 32) will replace
   *      non-breaking spaces, otherwise non-breaking spaces (character 160)
   *      will be used.
   */
  public static void setUseNormalSpace(boolean value) {

    // NOTE(review): the entity name below is reproduced exactly as found in
    // this file. It looks like a decoded "&nbsp;" entity - confirm that the
    // literal matches the key that Entities expects.
    if (value) {

      // Normal space
      Entities.add(" ", 32);
    }
    else {

      // Non-breaking space
      Entities.add(" ", 160);
    }
  }

  /**
   * Set the set of tags to put an end of line in for. The tag text excludes the
   * closing angle bracket, eg {@code <br} or {@code </h1}. Delegates to the
   * static {@code HTMLParser.setEndlineSet}.
   *
   * @param set The end of line tag set.
   */
  public static void setEndlineSet(Set<String> set) {
    HTMLParser.setEndlineSet(set);
  }

  /**
   * Set if the title should be included in the text output. Delegates to the
   * static {@code HTMLParser.setIncludeTitle}.
   *
   * @param include The title will be included if true. Defaults to true.
   */
  public static void setIncludeTitle(boolean include) {
    HTMLParser.setIncludeTitle(include);
  }

  /**
   * Set if image alt tags should be included in the text output. Delegates to
   * the static {@code HTMLParser.setIncludeImageAlts}.
   *
   * @param include The image alt tags will be included if true. Defaults to
   *      true.
   */
  public static void setIncludeImageAlts(boolean include) {
    HTMLParser.setIncludeImageAlts(include);
  }

  /**
   * Get the set of tags to put an end of line in for. The tag text excludes the
   * closing angle bracket, eg {@code <br} or {@code </h1}. Delegates to the
   * static {@code HTMLParser.getEndlineSet}.
   *
   * @return The end of line tag set.
   */
  public static Set<String> getEndlineSet() {
    return HTMLParser.getEndlineSet();
  }

  /**
   * Convert a string of HTML to a text string.
   *
   * @param html The string of HTML.
   * @return The converted string or null if <code>html</code> was null.
   * @throws ParseException If there is a problem parsing the HTML.
   * @throws IOException If an IO error occurs while reading the converted
   *      text. A failure to close the reader is only logged, not thrown.
   */
  public static String convert(String html) throws ParseException, IOException {

    if (html == null) {
      return null;
    }

    HTMLParser parser = new HTMLParser(new StringReader(html));
    StringWriter writer = new StringWriter();

    // Drain the parser's text reader into the writer
    Reader reader = parser.getReader();
    try {
      char[] buffer = new char[1024];
      int numRead = reader.read(buffer);
      while (numRead != -1) {
        writer.write(buffer, 0, numRead);
        numRead = reader.read(buffer);
      }
    } finally {
      try {
        reader.close();
      } catch (IOException e) {
        // The text has already been read (or a read error is propagating),
        // so a close failure is reported but must not mask the outcome
        logger.warn("Error closing reader", e);
      }
    }

    writer.flush();
    return writer.toString();
  }

  /**
   * Convert a reader with a HTML stream into a reader with a text stream.
   *
   * @param html The HTML reader.
   * @return A reader that reads the text conversion stream or null if <code>html</code>
   *      was null.
   * @throws ParseException If there is a problem parsing the HTML.
   * @throws IOException If an IO error occurs.
   */
  public static Reader convert(Reader html) throws ParseException, IOException {

    if (html == null) {
      return null;
    }

    HTMLParser parser = new HTMLParser(html);
    return parser.getReader();
  }

  /**
   * Runs this converter over a whole directory of HTML files or a single HTML
   * file and prints the text conversion to standard out. In directory mode the
   * entries are processed in sorted name order and each name is echoed to
   * standard error before it is converted. <p>
   *
   * The args are either:
   * <ol>
   *   <li> -dir directory_name (convert entire directory) or</li>
   *   <li> file_name (convert a single file)</li>
   * </ol>
   * </p>
   *
   * @param args The command line args.
   * @throws IllegalArgumentException If no arguments are given, or if
   *      <code>-dir</code> is given without a directory name.
   * @throws IOException If the named directory cannot be listed.
   * @throws Exception on any other error.
   */
  public static void main(String[] args) throws Exception {

    if (args == null || args.length == 0) {
      throw new IllegalArgumentException(USAGE);
    }

    // Directory
    if ("-dir".equals(args[0])) {

      if (args.length < 2) {
        throw new IllegalArgumentException("Missing directory name. " + USAGE);
      }

      // File.list() returns null when the path is not a directory or an IO
      // error occurs; report that instead of failing inside Arrays.sort()
      String[] files = new File(args[1]).list();
      if (files == null) {
        throw new IOException("Unable to list directory: " + args[1]);
      }

      Arrays.sort(files);

      for (String name : files) {
        System.err.println(name);
        parse(new File(args[1], name));
      }
    }
    // One file
    else {
      parse(new File(args[0]));
    }
  }

  /**
   * Convert a HTML file to text and write it to standard out, one line at a
   * time.
   *
   * @param file The file to convert.
   * @throws Exception on error. A failure to close the reader is reported on
   *      standard error and not thrown.
   */
  private static void parse(File file) throws Exception {

    HTMLParser parser = new HTMLParser(file);
    BufferedReader reader = new BufferedReader(parser.getReader());

    try {
      for (String l = reader.readLine(); l != null; l = reader.readLine()) {
        System.out.println(l);
      }
    } finally {
      try {
        reader.close();
      } catch (IOException e) {
        // Command line helper: report the close failure on standard error
        System.err.println("Error closing reader");
        e.printStackTrace(System.err);
      }
    }
  }
}