package gr.ntua.ivml.mint.xml;
import gr.ntua.ivml.mint.db.AsyncNodeStore;
import gr.ntua.ivml.mint.db.DB;
import gr.ntua.ivml.mint.persistent.DataUpload;
import gr.ntua.ivml.mint.persistent.DataUpload.EntryProcessor;
import gr.ntua.ivml.mint.persistent.ReportI;
import gr.ntua.ivml.mint.persistent.XMLNode;
import gr.ntua.ivml.mint.persistent.XmlObject;
import gr.ntua.ivml.mint.persistent.XpathHolder;
import gr.ntua.ivml.mint.util.CSVParser;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.sql.Connection;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.log4j.Logger;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * Reads the CSV entries of a {@link DataUpload} and turns them into
 * {@link XMLNode}s that are handed to an {@link AsyncNodeStore}.
 *
 * The generated pseudo xml is one "items" element per parsed entry which
 * contains one "item" element per row. Every non empty cell becomes a child
 * element of the item (named after the header column, or "Field_n" if there
 * is no header) with a single text node.
 *
 * @author Arne Stabenau
 *
 */
public class CsvToXmlReader {
    private static final Logger log = Logger.getLogger( CsvToXmlReader.class );

    // connection used to fetch blocks of node ids
    Connection c;
    boolean hasHeader;
    char delimiter, escChar;
    DataUpload du;
    XmlObject xml;
    XpathHolder root;
    AsyncNodeStore ans;

    // last node id handed out, -1 until the first block of ids was fetched
    private long currentNodeNumber=-1l;
    // current block of ids: [0] is the first id, [1] the last id to hand out
    private long[] nodeNumbers;
    // number of nodes passed to the node store
    private long nodeCount = 0l;
    // number of "item" elements (rows) created
    private int itemCount=0;

    /**
     * The quote is fixed to '"'. Delimiter is free, so is escChar.
     * Only the first character of delimiter and escChar is used; a null or
     * empty string results in the '\0' character.
     *
     * @param du the upload whose entries are parsed
     * @param hasHeader true if the first row of every entry contains the column names
     * @param delimiter string whose first character separates the fields
     * @param escChar string whose first character is the escape character
     * @throws Exception
     */
    public CsvToXmlReader( DataUpload du, boolean hasHeader, String delimiter, String escChar ) throws Exception {
        this.du = du;
        this.hasHeader = hasHeader;
        this.delimiter = firstCharOrNul( delimiter );
        this.escChar = firstCharOrNul( escChar );
    }

    /**
     * First character of the given string, or '\0' if it is null or empty.
     * Guards against the StringIndexOutOfBoundsException of charAt(0) on "".
     */
    private static char firstCharOrNul( String s ) {
        return (( s != null ) && ( s.length() > 0 )) ? s.charAt(0) : '\0';
    }

    /**
     * Parses all ".csv" and ".txt" entries of the upload, stores the resulting
     * nodes and updates the upload with node count, xml object, message,
     * status and item xpath.
     *
     * On failure the upload is marked as ERROR (if it is not already), the node
     * store is aborted, the xml object is made transient and the original
     * exception is rethrown. Problems during that cleanup are only logged so
     * they cannot hide the exception that caused the failure.
     *
     * @throws Exception whatever went wrong during parsing or storing
     */
    public void parse() throws Exception {
        c = DB.getStatelessSession().connection();
        try {
            root = new XpathHolder();
            root.name = "";
            root.parent = null;
            root.xpath = "";
            xml = new XmlObject();
            root.xmlObject = xml;
            DB.getXmlObjectDAO().makePersistent(xml);
            ans = new AsyncNodeStore( xml );
            EntryProcessor ep = new EntryProcessor( ) {
                public void processEntry(de.schlichtherle.util.zip.ZipEntry ze, InputStream is) throws Exception {
                    if( ze.isDirectory()) return;
                    String entryName = ze.getName();
                    if( !( entryName.endsWith(".csv") || entryName.endsWith(".txt"))) return;
                    // makes this process interruptible
                    Thread.sleep(0);
                    CSVParser parser = new CSVParser( delimiter, '\"', escChar);
                    // the reader is not closed here, the stream is owned by the caller
                    BufferedReader br = new BufferedReader( new InputStreamReader( is, "UTF8" ));
                    parseEntry( parser, br );
                }
            };
            du.processAllEntries(ep);
            DB.commit();
            DB.getSession().clear();
            ans.finish();
            DB.getSession().refresh(du);
            du.setNodeCount(nodeCount);
            du.setXmlObject(xml);
            du.setMessage("Uploaded " + itemCount + " items." );
            du.setStatus(DataUpload.OK);
            du.setItemXpath(xml.getByPathWithPrefix("/items/item", true));
            DB.getDataUploadDAO().makePersistent(du);
        } catch( Exception e ) {
            log.error( "Problem during csv parsing", e );
            try {
                if( du.getStatus() != DataUpload.ERROR ) {
                    du.setStatus(DataUpload.ERROR);
                    // exceptions like NullPointerException carry no message
                    String message = e.getMessage();
                    du.setMessage(( message != null ) ? message : e.toString());
                    DB.commit();
                }
            } catch( Exception markFailure ) {
                log.error( "Could not mark the upload as failed", markFailure );
            }
            try {
                // ans and xml are null if the failure happened before they were created
                if( ans != null ) ans.abort();
                // rollback somehow .... lots of commits already in ..
                if( xml != null ) DB.getXmlObjectDAO().makeTransient(xml);
                DB.getSession().clear();
            } catch( Exception cleanupFailure ) {
                log.error( "Cleanup after failed csv parsing failed", cleanupFailure );
            }
            throw e;
        }
        DB.commit();
    }

    /**
     * Does the Job
     * Pseudo xml is <records> <items> and then either
     * <field_1> <field_2> ...
     * or from the headers
     * <header1> <header2>
     *
     * An entry without any data row (empty, or only a header) is skipped with
     * a warning and creates no nodes.
     *
     * @param parser the parser that splits the lines
     * @param reader the content of one entry
     * @throws Exception if a required header is missing or a row has a
     * different length than the header, or on read / store problems
     */
    private void parseEntry( CSVParser parser, BufferedReader reader ) throws Exception {
        String[] header = null;
        if( hasHeader ) {
            header = readNext( parser, reader );
            if(( header == null ) || ( header.length == 0 )) throw new Exception( "No header found" );
        }
        String[] tokens = readNext( parser, reader );
        if(( tokens == null ) || ( tokens.length == 0 )) {
            // without this guard the missing root node caused a NullPointerException
            log.warn( "Csv entry without data rows skipped" );
            return;
        }

        // the root node for this entry
        XMLNode records = new XMLNode(newNodeId());
        records.nodeType = XMLNode.ELEMENT;
        records.size = 1;
        records.setXpathHolder(root.getByNameUri("items", "" ));
        records.parentNodeId = 0l;
        records.xmlObject = xml;

        while( tokens != null ) {
            if(( header != null ) && (tokens.length != header.length)) {
                throw new Exception( "Header and row have different length" );
            }
            XMLNode item = newChild( records, "item", null );
            for( int i=0; i<tokens.length; i++ ) {
                String tagname = "Field_"+(i+1);
                if( header != null ) {
                    tagname = escTagname( header[i]);
                }
                // empty cells do not create any nodes
                if(( tokens[i] != null ) && (tokens[i].length()>0)) {
                    XMLNode field = newChild( item, tagname, null );
                    XMLNode text = newChild( field, "text()", tokens[i]);
                    // the field element and its text node
                    field.size = 2;
                    item.size+=2;
                    store( text );
                    store( field );
                }
            }
            store( item );
            itemCount++;
            records.size += item.size;
            tokens = readNext( parser, reader );
        }
        store( records );
    }

    /**
     * Simplified node creation for this special case.
     * Creates a TEXT node if content is given, an ELEMENT node otherwise.
     * The node gets a fresh id, size 1, and xml object, xpath holder and
     * parent id derived from the parent.
     *
     * @param parent the node the new one belongs to
     * @param tagname name to look up the xpath holder below the parents holder
     * @param content text of the node or null for an element
     * @return the new node, it is not stored yet
     */
    private XMLNode newChild( XMLNode parent, String tagname, String content ) throws Exception {
        XMLNode result = new XMLNode( newNodeId());
        if( content == null )
            result.nodeType = XMLNode.ELEMENT;
        else {
            result.nodeType = XMLNode.TEXT;
            result.content = content;
        }
        result.setXmlObject(parent.getXmlObject());
        result.setXpathHolder(parent.getXpathHolder().getByNameUri(tagname, ""));
        result.size = 1;
        result.parentNodeId = parent.getNodeId();
        return result;
    }

    /**
     * Reads the next line from the buffer and converts to a string array.
     * Allow for empty lines.
     * Want to allow for comment lines as well...
     * As long as the parser reports pending input, further lines are read
     * and their tokens are appended to the result.
     *
     * @return a string array with each comma-separated element as a separate
     * entry, or null if there is no more input.
     *
     * @throws Exception
     * if bad things happen during the read
     */
    private String[] readNext( CSVParser parser, BufferedReader reader ) throws Exception {
        String[] result = null;
        do {
            String nextLine;
            // skip empty lines if they are there
            do {
                nextLine = reader.readLine();
                if( nextLine == null ) break;
                // while the parser is pending an empty line is kept and parsed
                if( parser.isPending() ) break;
            } while( nextLine.trim().length() == 0 );
            if( nextLine == null ) {
                // end of input
                if( parser.isPending()) throw new Exception( "Quotes not matching, missing input!");
                else return null;
            }
            String[] r = parser.parseLineMulti(nextLine);
            if (r.length > 0) {
                if (result == null) {
                    result = r;
                } else {
                    // append the tokens of the continuation line
                    String[] t = new String[result.length+r.length];
                    System.arraycopy(result, 0, t, 0, result.length);
                    System.arraycopy(r, 0, t, result.length, r.length);
                    result = t;
                }
            }
        } while (parser.isPending());
        return result;
    }

    /**
     * Get ids from the db and give them out on request.
     * A new block of ids is fetched on first use and whenever the last id of
     * the current block was handed out.
     *
     * @return the next node id
     * @throws SAXException if no ids could be fetched
     */
    private long newNodeId() throws SAXException {
        if(( currentNodeNumber == -1l) || ( currentNodeNumber == nodeNumbers[1])) {
            // need to get new nodenumbers
            nodeNumbers = AsyncNodeStore.getIds(c);
            if( nodeNumbers[0] < 0 ) {
                throw new SAXException( "Couldnt aquire node ids from DB");
            } else {
                currentNodeNumber = nodeNumbers[0];
            }
        } else {
            currentNodeNumber++;
        }
        return currentNodeNumber;
    }

    /**
     * Saves the xpath holder of the node if the session does not contain it
     * yet, then passes the node to the node store and counts it.
     * A missing xpath holder or a holder without db id is only logged.
     *
     * @param n the node to store
     */
    private void store( XMLNode n ) throws Exception {
        if( n.getXpathHolder() != null ) {
            if( !DB.getSession().contains(n.getXpathHolder())) {
                DB.getSession().save( n.getXpathHolder());
                // commit not needed to get dbID of pathHolder
            }
            if( n.getXpathHolder().getDbID() == null )
                log.warn( "XpathHolder with no id!!");
        } else {
            log.warn( "No xpath Holder!!!");
        }
        // store the node asynchronous from reading, multithreading ...
        ans.store(n);
        nodeCount++;
    }

    /**
     * Makes a tag name out of a header. Letters are kept everywhere; digits,
     * '-', '.' and '_' are kept everywhere but at the first position.
     * Every other character is replaced with '_'.
     *
     * @param name the header as read from the csv
     * @return the name with all not accepted characters replaced
     */
    private String escTagname( String name ) {
        StringBuilder sb = new StringBuilder();
        for( int i=0; i<name.length(); i++ ) {
            boolean append = false;
            char current = name.charAt(i);
            if( Character.isLetter( current )) append = true;
            else if( i>0 ) {
                append = Character.isDigit(current) ||
                    ( current == '-' ) ||
                    ( current == '.' ) ||
                    ( current == '_' );
            }
            if( append ) sb.append( current );
            else sb.append( "_" );
        }
        return sb.toString();
    }
}