package org.basex.build.file;
import static org.basex.util.Token.*;
import static org.basex.core.Text.*;
import java.io.IOException;
import java.util.Locale;
import org.basex.build.SingleParser;
import org.basex.core.BaseXException;
import org.basex.core.Prop;
import org.basex.io.IO;
import org.basex.io.in.NewlineInput;
import org.basex.util.TokenBuilder;
import org.basex.util.XMLToken;
import org.basex.util.list.TokenList;
/**
* This class parses files in the CSV format
* and sends events to the specified database builder.
*
* <p>The parser provides some options, which can be specified via
* <code>SET PARSEROPT ...</code>:</p>
*
* <ul>
* <li><code>separator</code> defines the column separator, which can be
* <code>comma</code>, <code>semicolon</code>, or <code>tab</code>
* (default: <code>comma</code>).</li>
* <li><code>header</code> specifies if the input file contains a header.
* Can be set to <code>yes</code> or <code>no</code>
* (default: <code>yes</code>)</li>
* <li><code>format</code> specifies the XML format, which can be
* <code>simple</code> or <code>verbose</code>
* (default: <code>verbose</code>).</li>
* </ul>
*
* <p>All options are separated by commas, and the keys and values are
* separated by an equals sign (=).</p>
*
* <p><b>Example</b>:
* <code>SET PARSEROPT separator=tab,format=simple,header=no; CREATE DB ...
* </code><br/>
* <b>Description</b>: Use tabs as separator, choose simple XML format,
* and indicate that the file contains no header.</p>
*
* @author BaseX Team 2005-12, BSD License
* @author Christian Gruen
*/
public final class CSVParser extends SingleParser {
  /** Separators. */
  public static final String[] SEPARATORS = { "comma", "semicolon", "tab" };
  /** Formats. */
  public static final String[] FORMATS = { "simple", "verbose" };

  /** CSV root element. */
  private static final byte[] CSV = token("csv");
  /** CSV header element. */
  private static final byte[] HEADER = token("header");
  /** CSV record element. */
  private static final byte[] RECORD = token("record");
  /** CSV field element. */
  private static final byte[] ENTRY = token("entry");
  /** CSV column attribute. */
  private static final byte[] COLUMN = token("col");

  /** Column separator (see {@link ParserProp#SEPARATOR}). */
  private final int separator;
  /** Headers (element names used for the fields in the verbose format). */
  private final TokenList headers = new TokenList();
  /** Simple format. */
  private final boolean simple;
  /** Encoding. */
  private final String encoding;

  /** Current row; {@code 0} denotes the header row. */
  private int row;
  /** Current column. */
  private int col;

  /**
   * Constructor.
   * @param source document source
   * @param target target path
   * @param prop database properties
   * @throws IOException I/O exception; also raised if the separator or
   *   format option has an unknown value
   */
  public CSVParser(final IO source, final String target, final Prop prop)
      throws IOException {

    super(source, target);

    // set parser properties
    final ParserProp props = new ParserProp(prop.get(Prop.PARSEROPT));
    // without a header, parsing starts directly with the first record
    row = props.is(ParserProp.HEADER) ? 0 : 1;

    // set separator
    final String sep = props.get(ParserProp.SEPARATOR).
        toLowerCase(Locale.ENGLISH);
    separator = sep.equals(SEPARATORS[0]) ? ',' :
                sep.equals(SEPARATORS[1]) ? ';' :
                sep.equals(SEPARATORS[2]) ? '\t' : -1;
    if(separator == -1) throw new BaseXException(
        INVALID_VALUE_X_X, ParserProp.SEPARATOR[0], sep);

    // set XML format
    final String format = props.get(ParserProp.FORMAT).
        toLowerCase(Locale.ENGLISH);
    simple = format.equals(FORMATS[0]);
    if(!simple && !format.equals(FORMATS[1])) throw new BaseXException(
        INVALID_VALUE_X_X, ParserProp.FORMAT[0], format);

    encoding = props.get(ParserProp.ENCODING);
  }

  @Override
  public void parse() throws IOException {
    builder.startElem(CSV, atts);

    final TokenBuilder tb = new TokenBuilder();
    // indicates if no element has been opened yet for the current line
    boolean open = true;

    final NewlineInput nli = new NewlineInput(src, encoding);
    try {
      boolean quoted = false;
      // -1 requests a new character; other values are pending look-aheads
      int ch = -1;
      while(true) {
        if(ch == -1) ch = nli.read();
        if(ch == -1) break;

        if(quoted) {
          if(ch == '"') {
            // look ahead: two subsequent quotes represent a single quote
            ch = nli.read();
            if(ch != '"') {
              // end of quoted section: process look-ahead in next iteration
              quoted = false;
              continue;
            }
          }
          // characters that are invalid in XML are replaced, as is done
          // for unquoted input
          tb.add(XMLToken.valid(ch) ? ch : '?');
        } else if(ch == separator) {
          if(open) {
            open();
            open = false;
          }
          add(tb);
        } else if(ch == '\n') {
          finish(tb, open);
          open = true;
        } else if(ch == '"') {
          quoted = true;
        } else {
          tb.add(XMLToken.valid(ch) ? ch : '?');
        }
        ch = -1;
      }
    } finally {
      // release the input even if reading or building fails
      nli.close();
    }

    // process the last line, which may not be terminated by a newline
    finish(tb, open);
    builder.endElem();
  }

  /**
   * Opens a new record. In the header row, a header element is only
   * opened if the simple format is used.
   * @throws IOException I/O exception
   */
  private void open() throws IOException {
    if(row == 0) {
      if(simple) builder.startElem(HEADER, atts);
    } else {
      builder.startElem(RECORD, atts);
    }
  }

  /**
   * Finishes the current record. If no element has been opened yet and the
   * token builder is empty, no record is created and the row counter is
   * left untouched.
   * @param tb token builder
   * @param open open flag ({@code true} if no element was opened yet for
   *   the current line)
   * @throws IOException I/O exception
   */
  private void finish(final TokenBuilder tb, final boolean open)
      throws IOException {

    boolean close = !open;
    if(open && tb.size() != 0) {
      // line consists of a single field: open the record now
      open();
      close = true;
    }
    add(tb);
    if(close) {
      // in the verbose format, no element was opened for the header row
      if(simple || row != 0) builder.endElem();
      ++row;
    }
    col = 0;
  }

  /**
   * Adds a field. In the header row of the verbose format, the field is
   * registered as header name instead. In the verbose format, empty fields
   * are not added to the builder.
   * @param tb token builder
   * @throws IOException I/O exception
   */
  private void add(final TokenBuilder tb) throws IOException {
    if(row == 0 && !simple) {
      addHeader(tb.finish());
      tb.reset();
      return;
    }

    final byte[] t;
    if(simple) {
      t = ENTRY;
    } else {
      // more fields than headers: create a default header name
      if(col == headers.size()) addHeader(COLUMN);
      t = headers.get(col);
    }

    if(tb.size() != 0 || simple) {
      builder.startElem(t, atts);
      builder.text(tb.finish());
      builder.endElem();
      tb.reset();
    }
    ++col;
  }

  /**
   * Adds a field header. Characters that are not allowed at their position
   * in an NCName are replaced by underscores, an empty name is replaced by
   * the default column name, and a number (starting with 2) is appended if
   * the resulting name has already been registered.
   * @param f field name
   */
  private void addHeader(final byte[] f) {
    // create tag name
    final TokenBuilder nm = new TokenBuilder();
    for(int p = 0; p < f.length; p += cl(f, p)) {
      final int cp = cp(f, p);
      final boolean ok = p == 0 ? XMLToken.isNCStartChar(cp) :
        XMLToken.isNCChar(cp);
      nm.add(ok ? cp : '_');
    }
    // no valid characters found: add default column name
    if(nm.size() == 0) nm.add(COLUMN);

    // tag exists: attach enumerator
    final byte[] name = nm.finish();
    byte[] fb = name;
    if(headers.contains(fb)) {
      int c = 2;
      do {
        fb = concat(name, token(c++));
      } while(headers.contains(fb));
    }
    // add header
    headers.add(fb);
  }
}