package gr.ntua.ivml.athena.persistent; import gr.ntua.ivml.athena.db.DB; import gr.ntua.ivml.athena.mapping.MappingElement; import gr.ntua.ivml.athena.util.TraversableI; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.StringWriter; import java.sql.SQLException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.Enumeration; import java.util.HashMap; import java.util.List; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; import java.util.zip.ZipInputStream; import java.util.zip.ZipOutputStream; import org.apache.commons.lang.CharSet; import org.apache.commons.lang.CharSetUtils; import org.apache.log4j.Logger; import org.hibernate.Hibernate; import de.schlichtherle.util.zip.BasicZipFile; public class DataUpload implements Lockable { public interface EntryProcessor { public void processEntry( de.schlichtherle.util.zip.ZipEntry ze, InputStream is ) throws Exception ; } private static final Logger log = Logger.getLogger( DataUpload.class ); public static final int OK = 0; public static final int HARVEST = 1; public static final int UPLOAD = 2; public static final int PARSE = 3; public static final int INDEX = 4; public static final int SCHEDULED = 5; public static final int ERROR = -1; public static final int QUEUED = 6; private static HashMap<Integer,String> statusText = new HashMap<Integer, String>(); static { statusText.put( -1, "ERROR"); statusText.put( 0, "OK"); statusText.put( 1, "HARVEST"); statusText.put( 2, "UPLOAD"); statusText.put( 3, "PARSE"); statusText.put( 4, "INDEX"); statusText.put( 5, "SCHEDULED"); } Long dbID; Date uploadDate; Organization organization; User uploader; int noOfFiles; String sourceURL; String originalFilename; String message; // resumption token for oai String resumptionToken; // see defined ints above int status; // xml or excel, maybe other formats in the future String structuralFormat; boolean httpUpload; boolean adminUpload; boolean oaiHarvest; boolean zippedUpload; long uploadSize; long nodeCount; Mapping jsonMapping; String xsl; BlobWrap data; public XpathHolder itemXpath; public XpathHolder itemLabelXpath; String schemaName; // represents the parsed xml in the database XmlObject xmlObject; public XmlObject getXmlObject() { return xmlObject; } public void setXmlObject(XmlObject xmlObject) { this.xmlObject = xmlObject; } // some transient stuff, not going to DB // tmpZip is exactly what is in the blob, so if it exists, // use tmpZip instead of blob private File tmpZip; private boolean unloaded = false; private XpathHolder rootXpath; public Mapping getJsonMapping() { return jsonMapping; } public void setJsonMapping(Mapping jsonMapping) { this.jsonMapping = jsonMapping; } public String getXsl() { return xsl; } public void setXsl(String xsl) { this.xsl = xsl; } public XpathHolder getRootXpath() { if( rootXpath == null ) { rootXpath=DB.getXpathHolderDAO().getRoot( this ); } return rootXpath; } /** * Use this function to create the DataUpload object. Provide * a file with the uploaded data. Set the originalFilename * later on the created object. As well, please provide * structural Format and whether this was an httpUpload or an * adminUpload * * @param data * @param uploader * @return */ public static DataUpload create( File data, User uploader, String originalFilename ) throws IOException { DataUpload upload = new DataUpload(); upload.uploader = uploader; upload.originalFilename = originalFilename; upload.uploadDate = new Date(); upload.organization = uploader.getOrganization(); upload.uploadSize = data.length(); int entryCount; if( data.isDirectory()) { initFromDir( upload, data ); log.debug( "Dir upload into zipped BLOB" ); } else if(( entryCount = countZipEntries( data )) <0 ) { initFromSimpleFile( upload, data ); log.debug( "Simple file upload"); } else { // its a zip, direct upload possible upload.noOfFiles = entryCount; log.debug( "Zipped file upload - " + entryCount + " entries" ); upload.zippedUpload = true; FileInputStream fis = new FileInputStream( data ); upload.data = new BlobWrap(); upload.data.setData(Hibernate.createBlob(fis, (int) data.length())); } return upload; } /** * Call this method when you already have the DataUpload object without data * * @param data */ public void upload( File data ) throws Exception { try { int entryCount; if( data.isDirectory()) { initFromDir( this, data ); log.debug( "Dir upload into zipped BLOB" ); } else if(( entryCount = countZipEntries( data )) <0 ) { initFromSimpleFile( this, data ); log.debug( "Simple file upload"); } else { // its a zip, direct upload possible this.noOfFiles = entryCount; log.debug( "Zipped file upload - " + entryCount + " entries" ); this.zippedUpload = true; this.setUploadSize(data.length()); FileInputStream fis = new FileInputStream( data ); this.data = new BlobWrap(); this.data.setData(Hibernate.createBlob(fis, (int) data.length())); } } catch( IOException e ) { log.error( "Upload file failed ", e); updateStatus(ERROR); message= e.getMessage(); throw e; } } /** * Have a DataUpload object that doesnt have the data yet. This is * for uploads that run while there is no data imported to the database yet. * */ public static DataUpload create( User uploader, String originalFilename, String url ) { DataUpload upload = new DataUpload(); upload.uploader = uploader; upload.uploadDate = new Date(); upload.organization = uploader.getOrganization(); upload.originalFilename = originalFilename; upload.setSourceURL(url); return upload; } /** * Probably unused function, good for testing, uplaods a dir as zip into this DataUpload. * @param upload * @param data */ public static void initFromDir( DataUpload upload, File data ) { try { upload.tmpZip = File.createTempFile("AthenaUpload", ".zip"); FileOutputStream fos = new FileOutputStream( upload.tmpZip ); BufferedOutputStream bos = new BufferedOutputStream( fos, 8192 ); upload.noOfFiles = zipDirectory( data, bos ); upload.data = new BlobWrap(); upload.data.setData(Hibernate.createBlob(new FileInputStream(upload.tmpZip), (int) upload.tmpZip.length())); } catch( Exception e ) { log.error( "Directory upload failed, tmp zip file not created.", e); } } public static final int zipDirectory( File directory, OutputStream os ) throws IOException { ZipOutputStream zos = new ZipOutputStream( os ); int fileCount = zip( directory, directory, zos ); zos.close(); return fileCount; } public static final int zip(File directory, File base, ZipOutputStream zos) throws IOException { File[] files = directory.listFiles(); int fileCount = 0; byte[] buffer = new byte[8192]; int read = 0; for (int i = 0, n = files.length; i < n; i++) { if (files[i].isDirectory()) { // maybe we need to create the directory entry as well fileCount += zip(files[i], base, zos); } else { FileInputStream in = new FileInputStream(files[i]); ZipEntry entry = new ZipEntry(files[i].getPath().substring( base.getPath().length() + 1)); zos.putNextEntry(entry); while (-1 != (read = in.read(buffer))) { zos.write(buffer, 0, read); } zos.closeEntry(); in.close(); fileCount++; } } return fileCount; } private static void initFromSimpleFile( DataUpload upload, File data ) throws IOException { // not a zip file, needs to be zipped up upload.noOfFiles = 1; upload.zippedUpload = false; upload.setUploadSize(data.length()); // zip the file upload.tmpZip = File.createTempFile( "AthenaUpload", ".zip" ); ZipOutputStream zos = new ZipOutputStream( new FileOutputStream( upload.tmpZip )); ZipEntry ze = new ZipEntry( upload.originalFilename); ze.setMethod(ZipEntry.DEFLATED); ze.setTime(data.lastModified()); zos.putNextEntry(ze); byte[] buffer = new byte[4096]; int count; FileInputStream fis = new FileInputStream( data ); while(( count = fis.read(buffer, 0, 4096)) != -1 ) { zos.write(buffer, 0, count); } zos.closeEntry(); zos.close(); // now put in place for upload fis = new FileInputStream( upload.tmpZip ); upload.data = new BlobWrap(); upload.data.setData(Hibernate.createBlob(fis, (int) upload.tmpZip.length())); } public void finalize() { if( tmpZip != null ) { log.info( "Removing " + tmpZip.getAbsolutePath()); tmpZip.delete(); } } public boolean isOaiHarvest() { return oaiHarvest; } public void setOaiHarvest(boolean oaiHarvest) { this.oaiHarvest = oaiHarvest; } public String getSchemaName() { return schemaName; } public void setSchemaName(String schemaName) { this.schemaName = schemaName; } public boolean isLido(){ if(this.schemaName!=null && this.schemaName.equalsIgnoreCase("LIDO")) return true; else return false; } public boolean isLido10(){ if(this.schemaName!=null && this.schemaName.equalsIgnoreCase("LIDO10")) return true; else return false; } public Long getDbID() { return dbID; } public void setDbID(Long dbID) { this.dbID = dbID; } public Date getUploadDate() { return uploadDate; } public void setUploadDate(Date uploadDate) { this.uploadDate = uploadDate; } public Organization getOrganization() { return organization; } public void setOrganization(Organization organization) { this.organization = organization; } public User getUploader() { return uploader; } public void setUploader(User uploader) { this.uploader = uploader; } public int getNoOfFiles() { return noOfFiles; } public void setNoOfFiles(int noOfFiles) { this.noOfFiles = noOfFiles; } public String getSourceURL() { return sourceURL; } public void setSourceURL(String sourceURL) { this.sourceURL = sourceURL; } public String getOriginalFilename() { return originalFilename; } public void setOriginalFilename(String originalFilename) { this.originalFilename = originalFilename; } public String getStructuralFormat() { return structuralFormat; } public void setStructuralFormat(String structuralFormat) { this.structuralFormat = structuralFormat; } public boolean isHttpUpload() { return httpUpload; } public void setHttpUpload(boolean httpUpload) { this.httpUpload = httpUpload; } public boolean isAdminUpload() { return adminUpload; } public void setAdminUpload(boolean adminUpload) { this.adminUpload = adminUpload; } public boolean isZippedUpload() { return zippedUpload; } public void setZippedUpload(boolean zippedUpload) { this.zippedUpload = zippedUpload; } public long getUploadSize() { return uploadSize; } public void setUploadSize(long uploadSize) { this.uploadSize = uploadSize; } public String getMessage() { return message; } public void setMessage(String message) { this.message = message; } public int getStatus() { return status; } public void updateStatus( int status ) { this.status = status; this.uploadDate = new Date(); } public String getResumptionToken() { return resumptionToken; } public void setResumptionToken(String resumptionToken) { this.resumptionToken = resumptionToken; } public String getStatusText() { String s = statusText.get( status ); if( s == null ) return ""; return s; } public void setStatus(int status) { this.status = status; } public long getNodeCount() { return nodeCount; } public void setNodeCount(long nodeCount) { this.nodeCount = nodeCount; } /** * If the file was uploaded as zip file archive, this function * returns entry names. Provide start and end to enable * paging through many files. * * This function only works inside the session, the object needs to * access the database. * * If the upload was not zipped, returns empty list. * @param start * @param length * @return */ public List<ZipEntry> listEntries( int start, int length ) { if( !unloaded ) unloadToTmpFile(); ArrayList<ZipEntry> l = new ArrayList<ZipEntry>(); if( !zippedUpload ) return l; ZipInputStream zis = null; try { zis = new ZipInputStream(new FileInputStream( tmpZip )); ZipEntry ze; int current = 0; while(( ze = zis.getNextEntry()) != null ) { if( current>=start && current<start+length) l.add( ze ); current++; if( current >= start+length ) break; } } catch( Exception e ) { log.error( "Error while unzipping from DB", e ); } return l; } /** * Not used yet, just playing with the ZippedStream * @param targetDir * @param dataStream * @throws IOException */ public static void unzipToDir( File targetDir, InputStream dataStream ) throws IOException { ZipInputStream zipStream = new ZipInputStream( dataStream ); ZipEntry entry; while(( entry = zipStream.getNextEntry()) != null ) { int BUFFER = 2048; File destFile = new File( targetDir, entry.getName()); if( entry.getName().endsWith("/")) { destFile.mkdirs(); continue; } destFile.getParentFile().mkdirs(); FileOutputStream fos = new FileOutputStream( destFile ); BufferedOutputStream dest = new BufferedOutputStream(fos, BUFFER); log.debug("Extracting: " + destFile.getAbsolutePath()); int count; byte data[] = new byte[BUFFER]; while ((count = zipStream.read(data, 0, BUFFER)) != -1) { dest.write(data, 0, count); } dest.flush(); dest.close(); } zipStream.close(); } public XpathHolder getItemXpath() { return itemXpath; } public void setItemXpath(XpathHolder itemXpath) { this.itemXpath = itemXpath; } public XpathHolder getItemLabelXpath() { return itemLabelXpath; } public void setItemLabelXpath(XpathHolder itemLabelXpath) { this.itemLabelXpath = itemLabelXpath; } /** * If its not a zip entry, this will just stream the file * out of the db. * @return */ public InputStream getEntry() throws SQLException, IOException { InputStream result = null; if( !unloaded) unloadToTmpFile(); try { ZipInputStream zis = new ZipInputStream( new FileInputStream( tmpZip )); zis.getNextEntry(); result = zis; } catch( Exception e ) { log.error( "Zip single file return failed ", e ); } return result; } /** * You can get a file streamed back by supplying the name. * (Get the name from listEntries() * @param name * @return */ public InputStream getEntry( String name ) { InputStream result = null; if( !unloaded) unloadToTmpFile(); try { ZipInputStream zis = new ZipInputStream( new FileInputStream( tmpZip )); ZipEntry ze = null; while(( ze = zis.getNextEntry()) != null ) { if( ze.getName().equals(name )) { result = zis; break; } } } catch( Exception e ) { log.error( "Reading of entry failed ", e ); } return result; } /** * Write an entry processor to process all files in an upload. * Even with one file in the upload call this, its easier! * @param ep */ public void processAllEntries( EntryProcessor ep ) throws Exception { InputStream is = null; de.schlichtherle.util.zip.ZipEntry ze = null; if( tmpZip == null ) unloadToTmpFile(); try { BasicZipFile bz = new BasicZipFile( tmpZip ); Enumeration entries = bz.entries(); while( entries.hasMoreElements() ) { ze = (de.schlichtherle.util.zip.ZipEntry) entries.nextElement(); InputStream zis = bz.getInputStream(ze); // log.debug( "Processing " + ze.getName()); try { ep.processEntry(ze, zis); } catch( Exception e ) { is = bz.getInputStream(ze); StringWriter sw = new StringWriter(); int count = 0; int readByte; int[] buffer = new int[16]; while(( readByte = is.read()) >= 0 ) { buffer[count%16] = readByte; count +=1; sw.write( String.format( "%02x ", readByte )); if(( count % 16) == 0 ) { sw.write( new String( buffer, 0, 16 )); sw.write( "\n" ); } if( count == 160 ) break; } log.info( "Problematic file: \n" + sw.toString()); throw e; } } } catch( Exception e ) { String msg; if(( e instanceof IllegalArgumentException ) && ( e.getMessage() == null ) && (ze == null)) { // we know this one msg = "Unknown encoding in zip filename entries."; } else { String entryName = "<unknown>"; if( ze != null ) entryName= ze.getName(); log.error( "Reading of entry "+ entryName + " failed ", e ); msg = "Reading of entry " + entryName + " failed.\n" + e.getMessage(); } throw new Exception( msg, e ); } finally { try { is.close(); } catch( Exception e ){}; } } public static int countZipEntries( File file ) { try { ZipFile zf = new ZipFile( file ); return zf.size(); } catch( Exception e ) { log.debug( file.getAbsolutePath() + " is no zip file!" ); } return -1; } /** * Get the uploaded data as stream from a tmp file. * Its zip format, even if the upload was not zipped!! * * Direct db streaming didn't work .. * @return InputStream, close after reading! */ public InputStream getDownloadStream() { InputStream is = null; if( tmpZip == null ) unloadToTmpFile(); try { is = new FileInputStream( tmpZip ); } catch( Exception e ) { log.error( "File unload problem", e); } return is; } /** * Because of trouble reopening a BLOB the blob is copied to the filesystem .. */ private void unloadToTmpFile() { try { tmpZip = File.createTempFile("unloaded", ".zip"); log.info( "Unloading to " + tmpZip.getAbsolutePath()); FileOutputStream fos = new FileOutputStream( tmpZip ); BufferedOutputStream bos = new BufferedOutputStream( fos,4096 ); InputStream is = data.getData().getBinaryStream(); byte[] buffer = new byte[2048]; int count; while(( count = is.read( buffer, 0, 2048 )) != -1) { bos.write(buffer, 0, count ); } is.close(); bos.close(); DB.commit(); unloaded = true; } catch( Exception e ) { log.error( "Cannot copy BLOB to tmp file", e ); } } public BlobWrap getData() { return data; } public void setData(BlobWrap data) { this.data = data; } public String getLockname() { return new SimpleDateFormat("dd/MM/yyyy HH:mm").format(getUploadDate()) + " " + getOriginalFilename(); } public ArrayList<String> listOfXPaths() { ArrayList<String> list = new ArrayList<String>(); XpathHolder rootXpath = this.getRootXpath(); if(rootXpath != null) { List<? extends TraversableI> children = rootXpath.getChildren(); for(TraversableI t: children) { XpathHolder xp = (XpathHolder) t; list.addAll(xp.listOfXPaths(true)); } } return list; } public List<Transformation> getTransformations() { return DB.getTransformationDAO().findByUpload(this); } } /* create table data_upload ( data_upload_id int primary key, upload_date timestamp, organization_id int references organization, uploader_id int references users, -- only for zips its bigger than 1 -- no_of_files int, -- for OAI reps here goes the URL -- source_url text, -- http uploads should provide this -- original_filename text, -- xml, excel and other later if we provide -- structural_data_format text, -- was it an http upload ? -- http_upload boolean, -- if admin upload is true, files reach the server somehow (email.. jikes) -- and the admin performs the upload admin_upload boolean, -- was the original data zipped? -- zipped_upload boolean, -- how many bytes were uploaded ? -- -- for a zip file this will be pretty much the same as the db -- -- blob size -- upload_size bigint, -- zipped content of whatever was uploaded in whatever way -- data oid ); */