// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.2
// Copyright (C) 2004-2009 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
package net.htmlparser.jericho;
import java.util.*;
import java.io.*;
/**
* Contains miscellaneous utility methods not directly associated with the HTML Parser library.
*/
public final class Util {
	/** Number of characters requested from the reader on each read call made by {@link #getString(Reader)}. */
	private static final int BUFFER_SIZE=2048;

	/**
	 * Record terminator written by {@link #outputCSVLine(Writer,String[])}.
	 * This is the value of the <code>line.separator</code> system property, read once when this class is initialised.
	 */
	private static final String CSVNewLine=System.getProperty("line.separator");

	/** Not instantiable; every member of this class is static. */
	private Util() {}

	/**
	 * Returns the text loaded from the specified <code>Reader</code> as a string.
	 * <p>
	 * If a <code>null</code> argument is supplied to this method, an empty string is returned.
	 * <p>
	 * The reader is read until it reports end of stream and is always closed before this method returns,
	 * whether or not reading succeeds.
	 * If reading fails, the exception from reading is the one thrown; an <code>IOException</code> raised while
	 * subsequently closing the reader is discarded so that it can not replace the original failure.
	 * <p>
	 * To load text from an <code>InputStream</code>, use <code>getString(new InputStreamReader(inputStream,encoding))</code>.
	 *
	 * @param reader  the <code>java.io.Reader</code> from which to load the text, may be <code>null</code>.
	 * @return the text loaded from the specified <code>java.io.Reader</code> as a string, or an empty string if <code>reader</code> is <code>null</code>.
	 * @throws java.io.IOException if an I/O error occurs while reading, or while closing the reader after a successful read.
	 */
	public static String getString(final Reader reader) throws IOException {
		if (reader==null) {
			return "";
		}
		boolean readCompleted=false;
		try {
			final char[] copyBuffer=new char[BUFFER_SIZE];
			final StringBuilder sb=new StringBuilder();
			int charsRead;
			while ((charsRead=reader.read(copyBuffer,0,BUFFER_SIZE))!=-1) {
				sb.append(copyBuffer,0,charsRead);
			}
			final String text=sb.toString();
			readCompleted=true;
			return text;
		} finally {
			if (readCompleted) {
				// Nothing has failed yet, so a failure to close is reported to the caller.
				reader.close();
			} else {
				// An exception is already propagating from the read loop; do not let a
				// secondary failure in close() replace it.
				try {
					reader.close();
				} catch (IOException ignored) {
					// intentionally discarded in favour of the original exception
				}
			}
		}
	}

	/**
	 * Outputs the specified array of strings to the specified <code>Writer</code> in the format of a line for a CSV file.
	 * <p>
	 * "CSV" stands for <i>Comma Separated Values</i>.
	 * There is no single universally followed specification for a CSV file, so there is significant variation in
	 * the way different applications handle issues like the encoding of different data types and special characters.
	 * <p>
	 * Generally, a CSV file contains a list of records separated by line breaks, with each record consisting of a list of
	 * field values separated by commas.
	 * Each record in the file should contain the same number of field values, with the values at each position representing the same
	 * type of data in all the records.  In this way the file can also be divided into columns, often with the first line of the
	 * file containing the column labels.
	 * <p>
	 * Columns can have different data types such as text, numeric, date / time and boolean.
	 * A text value is often delimited with single (<code>'</code>) or double-quotes (<code>"</code>),
	 * especially if the value contains a comma, line feed, or other special character that is significant to the syntax.
	 * Encoding techniques for including quote characters themselves in text values vary widely.
	 * Values of other types are generally unquoted to distinguish them from text values.
	 * <p>
	 * This method produces output according to the following rules:
	 * <p>
	 * <ul>
	 *  <li>All values are considered to be of type text, except for the static constants {@link Config#ColumnValueTrue}
	 *   and {@link Config#ColumnValueFalse}, representing the boolean values <code>true</code> and <code>false</code> respectively.
	 *   These two constants are recognised by object identity (<code>==</code>), not by content, and are written unquoted and unescaped.
	 *  <li>All text values are enclosed in double-quotes.
	 *  <li>Double-quote characters contained in text values are encoded using two consecutive double-quotes (<code>""</code>).
	 *  <li><code>null</code> values are represented as empty fields.
	 *  <li>Fields are separated by a single comma, with no comma after the last field.
	 *  <li>The end of each record is represented by the platform line separator
	 *   (the value of the <code>line.separator</code> system property), for example CR/LF on Windows and LF on Unix-like systems.
	 *  <li>All characters in text values other than double-quotes, including any line breaks, are written exactly as supplied.
	 * </ul>
	 * <p>
	 * An empty <code>values</code> array results in only the line separator being written.
	 * The writer is neither flushed nor closed by this method.
	 *
	 * @param writer  the destination <code>java.io.Writer</code> for the output, must not be <code>null</code>.
	 * @param values  the field values of the record in column order, must not be <code>null</code> but may contain <code>null</code> elements.
	 * @throws java.io.IOException if an I/O error occurs.
	 * @throws NullPointerException if <code>writer</code> or <code>values</code> is <code>null</code>.
	 * @see FormFields#getColumnLabels()
	 * @see FormFields#getColumnValues(Map)
	 */
	public static void outputCSVLine(final Writer writer, final String[] values) throws IOException {
		for (int i=0; i<values.length; i++) {
			if (i!=0) {
				writer.write(',');
			}
			final String value=values[i];
			if (value==null) {
				continue; // a null value is an empty field: nothing between the separators
			}
			// Reference comparison: only the exact constant instances are treated as booleans.
			// A string with the same content held in a different object is written as quoted text.
			// NOTE(review): the identity test looks deliberate - confirm against the code that populates the values.
			if (value==Config.ColumnValueTrue || value==Config.ColumnValueFalse) {
				writer.write(value); // assumes neither ColumnValueTrue nor ColumnValueFalse contains double quotes.
			} else {
				writer.write('"');
				outputValueEscapeQuotes(writer,value);
				writer.write('"');
			}
		}
		writer.write(CSVNewLine);
	}

	/**
	 * Writes the specified text to the specified writer, following every double-quote character with a second double-quote.
	 * <p>
	 * The enclosing quotes of the field are not written by this method.
	 * The text is written in runs, each ending with a double-quote, rather than one character at a time.
	 *
	 * @param writer  the destination <code>java.io.Writer</code> for the output.
	 * @param text  the text to write, must not be <code>null</code>.
	 * @throws java.io.IOException if an I/O error occurs.
	 */
	private static void outputValueEscapeQuotes(final Writer writer, final String text) throws IOException {
		final int length=text.length();
		int runStart=0;
		int quotePos;
		while ((quotePos=text.indexOf('"',runStart))!=-1) {
			// Write everything up to and including the quote, then the extra quote that escapes it.
			writer.write(text,runStart,quotePos-runStart+1);
			writer.write('"');
			runStart=quotePos+1;
		}
		if (runStart<length) {
			writer.write(text,runStart,length-runStart);
		}
	}

	/**
	 * Returns a newly allocated character array containing the characters of the first string followed by the characters of the second.
	 *
	 * @param string1  the string supplying the leading characters, must not be <code>null</code>.
	 * @param string2  the string supplying the trailing characters, must not be <code>null</code>.
	 * @return a new array of length <code>string1.length()+string2.length()</code>.
	 */
	static char[] getConcatenatedCharArray(final String string1, final String string2) {
		final int length1=string1.length();
		final int length2=string2.length();
		final char[] charArray=new char[length1+length2];
		string1.getChars(0,length1,charArray,0);
		string2.getChars(0,length2,charArray,length1);
		return charArray;
	}
}