package gr.ntua.ivml.mint.concurrent;
import gr.ntua.ivml.mint.db.AsyncNodeStore;
import gr.ntua.ivml.mint.db.DB;
import gr.ntua.ivml.mint.harvesting.SingleHarvester;
import gr.ntua.ivml.mint.persistent.DataUpload;
import gr.ntua.ivml.mint.persistent.ReportI;
import gr.ntua.ivml.mint.persistent.Transformation;
import gr.ntua.ivml.mint.persistent.XmlSchema;
import gr.ntua.ivml.mint.persistent.XpathHolder;
import gr.ntua.ivml.mint.util.Config;
import gr.ntua.ivml.mint.util.NodeReader;
import gr.ntua.ivml.mint.util.StringUtils;
import gr.ntua.ivml.mint.xml.CsvToXmlReader;
import gr.ntua.ivml.mint.xsd.SchemaValidator;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Enumeration;
import java.util.zip.ZipException;
import javax.xml.transform.Source;
import javax.xml.transform.stream.StreamSource;
import org.apache.commons.io.IOUtils;
import org.apache.commons.net.ftp.FTP;
import org.apache.commons.net.ftp.FTPClient;
import org.apache.log4j.Logger;
import de.schlichtherle.io.FileInputStream;
import de.schlichtherle.util.zip.ZipEntry;
import de.schlichtherle.util.zip.ZipFile;
/**
 * UploadIndexer accompanies an upload from the moment the user initiates it
 * until the data has been acquired, stored as a blob, parsed and indexed.
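 *
 * A minimal usage sketch; the "net" queue name for the first phase is an
 * assumption, while the indexer requeues itself on the "db" queue for the
 * database-heavy second phase:
 * <pre>
 *   DataUpload du = ...; // a persisted upload record
 *   UploadIndexer indexer = new UploadIndexer( du, UploadIndexer.HTTPUPLOAD );
 *   Queues.queue( indexer, "net" );
 * </pre>
 *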
* @author Arne Stabenau
*
*/
public class UploadIndexer implements Runnable, ReportI {
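	// acquisition methods: how the data gets into the system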
public static final int FTPSERVER = 1;
public static final int URLUPLOAD = 2;
public static final int OAIHARVEST = 3;
public static final int SERVERFILE = 4;
public static final int HTTPUPLOAD = 5;
DataUpload du;
int method;
public String filename;
public File tmpFile;
public String set;
public String ns;
public Date from;
public Date to;
public boolean isCsv=false, hasHeader;
public String csvDelimiter, csvEscCharacter;
	// the indexer now has an immediate phase and a queued phase;
	// all the heavy database work happens in the queued phase
public boolean preQueue;
public static final Logger log = Logger.getLogger(UploadIndexer.class );
public UploadIndexer( DataUpload du, int method ) {
this.method = method;
this.du = du;
this.preQueue = true;
}
public UploadIndexer(DataUpload du, int method, String set, String ns, Date from, Date to){
this.method = method;
this.du = du;
this.set = set;
this.ns = ns;
this.from = from;
this.to = to;
this.preQueue = true;
}
public DataUpload getDataUpload() {
return du;
}
public void setServerFile( String filename ) {
this.filename = filename;
this.tmpFile = new File( filename );
}
	/**
	 * Depending on the acquisition method and the file type,
	 * different processing steps are performed.
	 * The UploadIndexer needs to run twice: first in a preQueue phase, then
	 * on a queue to do the database-heavy parts. Enqueueing happens
	 * once the data is stored as a blob in the database.
	 */
public void run() {
// http uploads are on file system already
// ftp downloads need transfer from ftp server
// url uploads need to be downloaded
// oai harvests need to harvest ...
// need to start transaction
DB.newSession();
DB.getSession().beginTransaction();
du = DB.getDataUploadDAO().findById(du.getDbID(), false);
DB.logPid();
try {
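			// phase 1: acquire the data, validate it against the declared schema,
			// store it as a blob and enqueue for the db phase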
if( preQueue ) {
check();
				acquire();
schemaCheck();
check();
upload();
check();
du.setStatus(DataUpload.QUEUED );
} else {
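				// phase 2: parse the stored blob and build the node index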
if( isCsv )
parseCsv();
else
parseXml();
check();
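				// index the parsed XML into the node store, using the stateless session's JDBC connection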
AsyncNodeStore.index(du.getXmlObject(), this, DB.getStatelessSession().connection());
if(du.getNodeCount()==0){
throw new Exception("No nodes stored.");
}
du.updateStatus(DataUpload.OK);
du.setMessage(du.getNodeCount()+ " nodes imported and indexed" );
store();
if(du.isDirect()){
setSchemaLevelLabel(du.getDirectSchema());
schemaTransform();
}
}
} catch( InterruptedException e ) {
log.info( "UploadIndexer interrupted, data will become invalid!" );
preQueue = false;
} catch( Exception e2 ) {
log.error( "UploadIndexer failed on DataUpload " + du.getDbID(), e2 );
if( du.getStatus() != DataUpload.ERROR) {
du.setMessage("Upload indexer failed on Upload " + du.getDbID() + " with: \n"+e2.getMessage());
du.setStatus(DataUpload.ERROR);
}
store();
preQueue = false;
} finally {
try {
DB.getSession().getTransaction().commit();
} catch( Exception e2 ) {
log.error( "Transaction cannot be commited!", e2 );
}
DB.closeSession();
DB.closeStatelessSession();
}
if( preQueue ) {
			// requeue this job for the database-heavy indexing phase
preQueue= false;
Queues.queue(this, "db");
}
}
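	/**
	 * Resolve the schema's item-level and item-label xpaths on this upload
	 * and store them on the DataUpload.
	 */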
private void setSchemaLevelLabel(XmlSchema schema) {
String xpath = schema.getItemLevelPath();
if(!StringUtils.empty(xpath)) {
log.debug("item level: " + xpath);
XpathHolder xplvl = du.getXmlObject().getRoot().getByRelativePath(xpath);
du.setItemXpath(xplvl);
String label= schema.getItemLabelPath();
if( ! StringUtils.empty( label )) {
log.debug("item label: " + label);
XpathHolder xplbl = du.getXmlObject().getRoot().getByRelativePath(label);
du.setItemLabelXpath(xplbl);
}
}
DB.getDataUploadDAO().makePersistent(du);
}
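	/**
	 * For direct uploads, create a pass-through Transformation whose output
	 * is the upload's own data.
	 */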
private void schemaTransform(){
Transformation t = new Transformation();
t.setDataUpload(du);
t.setParsedOutput( du.getXmlObject());
		// set the Mapping for this transformation to the hardcoded LidoToLido in db with id=1
//Mapping lido2lido=DB.getMappingDAO().getById(new Long(1), false);
//t.setMapping( some fake mapping );
//t.setMapping(lido2lido);
t.setZippedOutput( du.getData() );
t.setStatusCode(0);
t.setJsonMapping("");
t.setUser(du.getUploader());
DB.getTransformationDAO().makePersistent(t);
}
	/**
	 * Check if the upload complies with the declared schema (if any)
	 * and throw if that's not the case.
	 * @throws Exception
	 */
private void schemaCheck() throws Exception {
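		// only direct uploads declare a target schema to validate against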
if(du.isDirect()){
final XmlSchema schema = du.getDirectSchema();
try {
			ZipFile zf = new ZipFile( tmpFile );
			try {
				Enumeration<ZipEntry> e = zf.entries();
				while( e.hasMoreElements() ) {
					ZipEntry ze = e.nextElement();
					if( ze.isDirectory()) continue;
					String entryName = ze.getName();
					if( !entryName.endsWith(".xml") && !entryName.endsWith(".XML")) continue;
					// yield point, lets an interrupt cancel the job
					Thread.sleep(0);
					// open the entry only after the filters, so skipped entries don't leak streams
					InputStream is = zf.getInputStream(ze);
					try {
						BufferedInputStream bis = new BufferedInputStream( is );
						Source source = new StreamSource(bis);
						SchemaValidator.validate( source, schema );
					} catch( Exception ex ) {
						log.debug( "Schema validate failed on " + entryName, ex );
						throw new Exception( "Entry " + entryName + " failed Schema validation! \n" + ex.getMessage());
					} finally {
						is.close();
					}
				}
			} finally {
				zf.close();
			}
} catch( ZipException ze ) {
				// maybe it's just a single plain XML file
InputStream is = null;
try {
is = new BufferedInputStream( new FileInputStream(tmpFile));
Source source = new StreamSource(is);
SchemaValidator.validate( source, schema );
} catch( Exception ex ) {
log.debug( "Schema validate failed on " + du.getOriginalFilename(), ex );
throw new Exception( du.getOriginalFilename() + " failed Schema validation! \n" + ex.getMessage());
} finally {
if( is != null ) is.close();
}
}
}
}
	/**
	 * Get the data into the filesystem, depending on the acquisition method.
	 */
	private void acquire() throws Exception {
		if( method == FTPSERVER )
			acquireFtp();
		else if( method == URLUPLOAD)
			acquireUrl();
		else if( method == OAIHARVEST )
			acquireOAI();
	}
	/**
	 * Harvests from the DataUpload source URL. The result ends up in tmpFile,
	 * ready for upload.
	 * @throws Exception
	 */
	private void acquireOAI() throws Exception {
du.updateStatus(DataUpload.HARVEST);
this.store();
SingleHarvester harvester = null;
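		// no dates and no set: harvest everything; incomplete dates: harvest the whole set;
		// otherwise restrict the set to the from/to window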
if( ((this.from == null) || (this.to == null)) && (this.set == null) ){
harvester = new SingleHarvester(du.getSourceURL(), null, null, this.ns, null);
}else if((this.from == null) || (this.to == null)){
harvester = new SingleHarvester(du.getSourceURL(), null, null, this.ns, this.set);
}else{
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
harvester = new SingleHarvester(du.getSourceURL(), format.format(this.from), format.format(this.to), this.ns, this.set);
}
harvester.setReporter(this);
du.setOriginalFilename(du.getSourceURL());
this.store();
//this.tmpFile = new File(du.getOriginalFilename());
//log.error("filename:"+ du.getOriginalFilename());
try{
harvester.harvest();
this.tmpFile = new File(harvester.getFileName());
}catch(Exception e){
log.error("oai error:", e);
du.setMessage("oai harvesting encountered an error:" + e.getMessage());
du.updateStatus(DataUpload.ERROR);
store();
throw e;
}
}
	/**
	 * Given a DataUpload, retrieve the file from the ftp server
	 * and store it in tmpFile.
	 */
	private void acquireFtp() throws Exception {
		FileOutputStream fos = null;
		FTPClient f = new FTPClient();
		try {
			du.updateStatus(DataUpload.HARVEST);
			store();
			tmpFile = File.createTempFile("MintFtp", "" );
			fos = new FileOutputStream( tmpFile );
			f.connect(Config.get("ftp.host"));
			f.login(Config.get("ftp.user"), Config.get("ftp.password"));
			f.setFileType(FTP.BINARY_FILE_TYPE);
			if( ! f.retrieveFile(du.getOriginalFilename(), fos )) {
				log.error( "There was a problem retrieving the file");
				throw new Exception( "Retrieve failed" );
			}
			log.info( "FTPed " + du.getOriginalFilename() + " with " + tmpFile.length() + " bytes.");
		} catch( Exception e ) {
			log.error( "FTP file retrieve or storing didn't succeed", e );
			du.setMessage("FTP retrieve failed: " + e.getMessage());
			du.updateStatus(DataUpload.ERROR);
			store();
			throw e;
		} finally {
			// always release the stream and the ftp connection
			if( fos != null ) fos.close();
			if( f.isConnected()) f.disconnect();
		}
}
	private void acquireUrl() throws Exception {
		InputStream is = null;
		FileOutputStream fos = null;
		try {
			du.updateStatus(DataUpload.HARVEST);
			store();
			tmpFile = File.createTempFile("MintUrl", "" );
			is = new URL( du.getSourceURL()).openStream();
			fos = new FileOutputStream( tmpFile );
			IOUtils.copy( is, fos);
			fos.flush();
		} catch( Exception e ) {
			log.error( "URL download failed", e );
			du.setMessage("URL download failed: " + e.getMessage());
			du.updateStatus(DataUpload.ERROR);
			store();
			throw e;
		} finally {
			// always release both streams
			if( is != null ) is.close();
			if( fos != null ) fos.close();
		}
	}
	/**
	 * Move the data into the BLOB.
	 */
private void upload() throws Exception {
du.setMessage("" );
du.updateStatus(DataUpload.UPLOAD);
store();
du.upload(tmpFile);
store();
// get a clean du
DB.getSession().clear();
du = DB.getDataUploadDAO().findById(du.getDbID(), false);
log.info( "Delete " + tmpFile.getName());
if( method != SERVERFILE )
tmpFile.delete();
}
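	/**
	 * Parse the uploaded XML from the blob and read its nodes.
	 */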
private void parseXml() throws Exception {
du.setMessage("" );
du.updateStatus(DataUpload.PARSE);
store();
NodeReader nr = new NodeReader( du );
nr.readNodes();
DB.getSession().clear();
du = DB.getDataUploadDAO().findById(du.getDbID(), false);
}
	/**
	 * Dump the upload and parse .txt and .csv files into
	 * pseudo-XML and an XML object.
	 * @throws Exception
	 */
private void parseCsv() throws Exception {
du.setMessage("" );
du.updateStatus(DataUpload.PARSE);
store();
CsvToXmlReader csv = new CsvToXmlReader(du, hasHeader, csvDelimiter, csvEscCharacter);
csv.parse();
DB.getSession().clear();
du = DB.getDataUploadDAO().findById(du.getDbID(), false);
}
	/**
	 * Typing shortcut for committing the current transaction; could be inlined.
	 */
private final void store() {
DB.commit();
}
	/**
	 * Make the thread more interruptible; has the same effect as sleep(0).
	 * @throws InterruptedException
	 */
private final void check() throws InterruptedException {
if( Thread.currentThread().isInterrupted())
throw new InterruptedException( "Thread interrupted!" );
}
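	/**
	 * Run a single SQL update on the given connection and commit it.
	 */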
private void doSQL( Connection c, String sql ) throws SQLException {
Statement st;
st = c.createStatement();
st.executeUpdate( sql );
st.close();
c.commit();
}
@Override
public void report(String msg) {
du.setMessage(msg);
DB.commit();
}
@Override
public void reportError() {
du.setStatus(DataUpload.ERROR);
}
}