/*
* Created on Aug 16, 2005
*/
package org.openedit.entermedia.util;
/*
* Copyright (c) Ian F. Darwin, http://www.darwinsys.com/, 1996-2002.
* All rights reserved. Software written by Ian F. Darwin and others.
* $Id: Parser.java,v 1.1 2010/05/14 21:04:13 cburkey Exp $
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Java, the Duke mascot, and all variants of Sun's Java "steaming coffee
* cup" logo are trademarks of Sun Microsystems. Sun's, and James Gosling's,
* pioneering role in inventing and promulgating (and standardizing) the Java
* language and environment is gratefully acknowledged.
*
* The pioneering role of Dennis Ritchie and Bjarne Stroustrup, of AT&T, for
* inventing predecessor languages C and C++ is also gratefully acknowledged.
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import com.openedit.util.FileUtils;
/* Simple demo of CSV parser class.
*/
/** Parse comma-separated values (CSV), a common Windows file format.
* Sample input: "LU",86.25,"11/4/1998","2:19PM",+4.0625
* <p>
* Inner logic adapted from a C++ original that was
* Copyright (C) 1999 Lucent Technologies
* Excerpted from 'The Practice of Programming'
* by Brian W. Kernighan and Rob Pike.
* <p>
* Included by permission of the http://tpop.awl.com/ web site,
* which says:
* "You may use this code for any purpose, as long as you leave
* the copyright notice and book citation attached." I have done so.
* @author Brian W. Kernighan and Rob Pike (C++ original)
* @author Ian F. Darwin (translation into Java and removal of I/O)
* @author Ben Ballard (rewrote advQuoted to handle '""' and for readability)
*/
public class TabParser implements Parser {
public static final char DEFAULT_SEP = '\t';
protected BufferedReader fieldBufferedReader;
public BufferedReader getBufferedReader()
{
return fieldBufferedReader;
}
public void setBufferedReader(BufferedReader inBufferedReader)
{
fieldBufferedReader = inBufferedReader;
}
/** Construct a CSV parser, with the default separator (`,'). */
public TabParser() {
this(DEFAULT_SEP);
}
/** Construct a CSV parser with a given separator.
* @param sep The single char for the separator (not a list of
* separator characters)
*/
public TabParser(char sep) {
fieldSep = sep;
tabs = Pattern.compile(String.valueOf(sep));
}
/** The fields in the current String */
protected List list = new ArrayList();
protected Pattern tabs;
/** the separator char for this parser */
protected char fieldSep;
public String[] parseRegEx(String line)
{
//list.clear();
String[] args = tabs.split(line);
//list.addAll(Arrays.asList(args));
return args;
}
/** parse: break the input String into fields
* @return java.util.Iterator containing each field
* from the original as a String, in order.
*/
public List parse(String line)
{
StringBuffer sb = new StringBuffer();
list.clear(); // recycle to initial state
int i = 0;
if (line.length() == 0) {
list.add(line);
return list;
}
do {
sb.setLength(0);
if (i < line.length() && line.charAt(i) == '"')
i = advQuoted(line, sb, ++i); // skip quote
else
i = advPlain(line, sb, i);
list.add(sb.toString());
i++;
} while (i < line.length());
if (line.charAt(line.length()-1) == fieldSep)
list.add("");
return list;
}
/** advQuoted: quoted field; return index of next separator */
protected int advQuoted(String s, StringBuffer sb, int i)
{
int j;
int len= s.length();
for (j=i; j<len; j++) {
if (s.charAt(j) == '"' && j+1 < len) {
if (s.charAt(j+1) == '"') {
j++; // skip escape char
} else if (s.charAt(j+1) == fieldSep) { //next delimeter
j++; // skip end quotes
break;
}
} else if (s.charAt(j) == '"' && j+1 == len) { // end quotes at end of line
break; //done
}
sb.append(s.charAt(j)); // regular character.
}
return j;
}
/** advPlain: unquoted field; return index of next separator */
protected int advPlain(String s, StringBuffer sb, int i)
{
int j;
j = s.indexOf(fieldSep, i); // look for separator
if (j == -1) { // none found
sb.append(s.substring(i));
return s.length();
} else {
sb.append(s.substring(i, j));
return j;
}
}
public String[] readNext()
{
try
{
String line = getBufferedReader().readLine();
if( line == null)
{
return null;
}
line = line.replaceAll("\u001e"," , ");
//Only keep valid ASCII text
StringBuffer escapedSource = new StringBuffer(line.length());
//String zeros = "000000";
for ( int n = 0; n < line.length(); n++ )
{
char c = line.charAt( n );
if ( c > 31 && c < 127 )
{
escapedSource.append( c );
}
if ( c == '\t')
{
escapedSource.append( c );
}
else
{
//skip ISO just 32 - 126
}
}
line = escapedSource.toString();
List items = parse(line);
return (String[])items.toArray(new String[items.size()]);
}
catch( IOException ex)
{
throw new RuntimeException(ex);
}
}
public void close()
{
FileUtils.safeClose( getBufferedReader() );
}
}