package gr.ntua.ivml.athena.persistent;

import gr.ntua.ivml.athena.concurrent.Ticker;
import gr.ntua.ivml.athena.db.DB;
import gr.ntua.ivml.athena.util.Config;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.hibernate.Hibernate;

import de.schlichtherle.util.zip.ZipEntry;
import de.schlichtherle.util.zip.ZipOutputStream;

/**
 * This class summarizes all information needed to export a selection of
 * DataUploads. Need to encapsulate special target schema logic
 * Subclass? How do I do the Hibernate stuff ??
* * @author Arne Stabenau * */ public class Publication { public static class NodeContainer { public static final int LIDOVS09 = 1; public static final int LIDOVS10 = 2; public XMLNode node; public int schema; } public static class PathIterator implements Iterator<NodeContainer> { private static final XMLNode[] templatePage = new XMLNode[0]; List<Transformation> transformations; Iterator<Transformation> iterTransform; String path; XpathHolder currentHolder; Transformation currentTransformation; XMLNode nextItem; XMLNode[] page; int nextInPage; public PathIterator( List<Transformation> l, String path ) { transformations = l; iterTransform = transformations.iterator(); this.path = path; //next(); nextItem = nextInPage(); } @Override public boolean hasNext() { return nextItem != null; } private boolean nextHolder() { if( iterTransform.hasNext() ) { currentTransformation = iterTransform.next(); XmlObject xo = currentTransformation.getParsedOutput(); currentHolder = xo.getRoot().getByRelativePath(path); if( currentHolder != null ) log.debug( "Current transformation has " + currentHolder.getCount() + " items." ); } else { currentHolder = null; } return currentHolder != null; } /** * Retrieve next page from current holder or first from next * @return if there is stuff left */ private boolean nextPage() { List<XMLNode> l = null; if( page != null ) l= currentHolder.getNodes( page[page.length-1], 100); if(( page== null ) || ( l.size() == 0 )) { nextHolder(); if( currentHolder == null ) return false; l= currentHolder.getNodes( 0, 100); if( l.size() == 0 ) throw new RuntimeException( "Unexpected result, should have nodes"); } page = l.toArray(templatePage); nextInPage = 0; return true; } private XMLNode nextInPage() { XMLNode result=null; if(( page==null ) || ( nextInPage==page.length )) { if( ! 
nextPage()) return null; } result = page[nextInPage]; nextInPage+=1; return result; } @Override public NodeContainer next() { NodeContainer result = new NodeContainer(); result.node = nextItem; if( currentTransformation.getDataUpload().isLido10()) result.schema = NodeContainer.LIDOVS10; else result.schema = NodeContainer.LIDOVS09; nextItem = nextInPage(); return result; } @Override public void remove() { throw new NoSuchMethodError(); } } public static final Logger log = Logger.getLogger( Publication.class ); public static final int ERROR=-1; public static final int OK=0; public static final int IDLE=1; public static final int CONSOLIDATE=2; public static final int VERSION=3; public static final int POSTPROCESS=4; Long dbID; // all affected DataUpload objects List<DataUpload> inputUploads = new ArrayList<DataUpload>(); // example stats on the this publication, more could be collected long itemCount; // which user did the publication User publishingUser; Organization publishingOrganization; // status information on the progress of publication String statusMessage; int statusCode; String report; // when the publication was initiated Date lastProcess; // the final output in zipped form // either one or many files, possible millions BlobWrap zippedOutput; // name of output. 
With this the correct Transformations are selected String targetSchema; // transient only valid while in progress File workdir; File tmpFile; public Long getDbID() { return dbID; } public void setDbID(Long dbID) { this.dbID = dbID; } public List<DataUpload> getInputUploads() { return inputUploads; } public void setInputUploads(List<DataUpload> inputUploads) { this.inputUploads = inputUploads; } public long getItemCount() { return itemCount; } public void setItemCount(long itemCount) { this.itemCount = itemCount; } public User getPublishingUser() { return publishingUser; } public void setPublishingUser(User publishingUser) { this.publishingUser = publishingUser; } public Organization getPublishingOrganization() { return publishingOrganization; } public void setPublishingOrganization(Organization publishingOrganization) { this.publishingOrganization = publishingOrganization; } public String getStatusMessage() { return statusMessage; } public void setStatusMessage(String statusMessage) { this.statusMessage = statusMessage; } public int getStatusCode() { return statusCode; } public void setStatusCode(int statusCode) { this.statusCode = statusCode; } public Date getLastProcess() { return lastProcess; } public void setLastProcess(Date lastProcess) { this.lastProcess = lastProcess; } public String getReport() { return report; } public void setReport(String report) { this.report = report; } public BlobWrap getZippedOutput() { return zippedOutput; } public void setZippedOutput(BlobWrap zippedOutput) { this.zippedOutput = zippedOutput; } public String getTargetSchema() { return targetSchema; } public void setTargetSchema(String targetSchema) { this.targetSchema = targetSchema; } /** * Call this to check if the publication is still valid. * It should check whether changes in the input data (transformations, mappings) * are not reflected here. * * return true if Publication is still valid. 
*/ public boolean validate() { // go through all relevant transformations and check if any have // dates after the process of this Publication. return true; } /** * Check if the current state is still valid. * Check if a new processing round has to be done. * Do it (version, apply changes, pullup of changes, consolidate in one xml-object ) */ public void process() { File consolidated = null; File processed = null; try { if( !upToDateCheck()) { version(); applyChanges(); consolidated = consolidate(); processed = postProcess( consolidated ); writeBack( processed ); setLastProcess(new Date()); setStatusCode(OK); setStatusMessage("Processed and ready for download"); } } catch( Exception e ) { if( getStatusCode() != ERROR ) { setStatusCode(ERROR); setStatusMessage("Publication processing failed with: " + e.getMessage()); } // didn't work, remove transformations from upload getInputUploads().clear(); log.error( "processing of Publication failed.", e ); } finally { if( consolidated != null ) consolidated.delete(); if( processed != null ) processed.delete(); DB.commit(); } } /** * Convenience function to remove an upload. No processing is started. * @param du */ public void removeUpload( DataUpload du ) { Iterator<DataUpload> i = getInputUploads().iterator(); while( i.hasNext() ) { DataUpload du2 = i.next(); if( du2.getDbID() == du.getDbID()) { i.remove(); return; } } } /** * * @param du */ public boolean containsUpload( DataUpload du ) { Iterator<DataUpload> i = getInputUploads().iterator(); while( i.hasNext() ) { DataUpload du2 = i.next(); if( du2.getDbID() == du.getDbID()) { return true; } } return false; } /** * Convenience function to remove an upload, no reprocessing is started. 
* @param du */ public void addUpload( DataUpload du ) { getInputUploads().add( du ); } public List<Transformation> getTransformations() throws Exception { ArrayList<Transformation> al = new ArrayList<Transformation>(); // input uploads need sorting List<DataUpload> l = getInputUploads(); Collections.sort(l, new Comparator<DataUpload>() { public int compare( DataUpload a, DataUpload b ) { if( a.getUploadDate().before(b.getUploadDate())) return -1; if( a.getUploadDate().after( b.getUploadDate())) return 1; return 0; } }); for( DataUpload du: l ) { boolean hasTransformation = false; for( Transformation tr: du.getTransformations()) { String target1 = tr.getMapping().getTargetSchema(); if( target1 != null ) { if( target1.equals( getTargetSchema())) { if( tr.getStatusCode() == Transformation.OK) { al.add( tr ); hasTransformation = true; } } } else { if( getTargetSchema() == null ) { if( tr.getStatusCode() == Transformation.OK) { al.add( tr ); hasTransformation = true; } } } } if( ! hasTransformation ) throw new Exception( "Upload has no suitable Transformation" ); } return al; } // Section with real work /** * All involved Transformations happened before the last process date. * @return */ public boolean upToDateCheck() throws Exception { List<Transformation> l = getTransformations(); Date lastProcess = getLastProcess(); if( lastProcess == null ) return false; for( Transformation tr: l ) { if( ! tr.getEndTransform().before( lastProcess )) return false; } // all happened before last process, we are up to date return true; } /** * Create the List of items with available newer versions. * @throws Exception */ public void version() throws Exception { // do nothing for now } /** * Apply the changeset to the latest version of an item. 
* @throws Exception */ public void applyChanges() throws Exception { // do nothing for now } /** * Create the output XML object / files (not sure yet) * @throws Exception */ public File consolidate() throws Exception { long counter = 0l; File consolidated = File.createTempFile("consolidated", ".zip"); log.debug( "Consolidate into " + consolidated.getAbsolutePath()); ZipOutputStream zos = new ZipOutputStream(new FileOutputStream( consolidated )); Iterator<NodeContainer> i = itemize(); // every 20 seconds is set Ticker t = new Ticker( 20 ); while( i.hasNext() ) { NodeContainer nc = i.next(); XMLNode n = nc.node; XMLNode tree = DB.getXMLNodeDAO().wrappedDOMTree(n); counter += 1; String entryname; if( nc.schema == NodeContainer.LIDOVS09) entryname = "lido09_"+n.getNodeId() + ".xml"; else entryname = "lido10_"+n.getNodeId() + ".xml"; zos.putNextEntry(new ZipEntry(entryname )); PrintWriter pw = new PrintWriter( new OutputStreamWriter(zos, "UTF8" ) { public void close() {}; } ); tree.toXml(pw); pw.flush(); zos.closeEntry(); if( t.isSet() ) { t.reset(); log.debug( "Consolidated " + counter + " items for " + getPublishingOrganization().getName() ); setStatusMessage("Consolidated " + counter + " items." ); DB.commit(); } DB.getSession().evict(n); } zos.flush(); zos.close(); t.cancel(); log.debug( "Consolidated " + counter + " items for " + getPublishingOrganization().getName() ); return consolidated; } /** * Do some processing after the consolidation?? * @throws Exception */ public File postProcess( File input ) throws Exception { return input; } /** * Not sure what I need this yet. Somehow the ZIP approach to store item XML * might be performance desaster, I don't trust it will last. Final version * probably is memory mapped big file with index file. 
* @return * @throws IOException */ public static File createTempDirectory() throws IOException { final File temp; temp = File.createTempFile("temp", Long.toString(System.nanoTime())); if(!(temp.delete())) { throw new IOException("Could not delete temp file: " + temp.getAbsolutePath()); } if(!(temp.mkdir())) { throw new IOException("Could not create temp directory: " + temp.getAbsolutePath()); } return (temp); } /** * Write all input items into the workdir in some form. It could be a million, so files in one dir * is not a good approach ( doesn't scale ). * - try zip archive?? * - subdir approach ? * - one file with index file ? * * @throws Exception */ public Iterator<NodeContainer> itemize() throws Exception { return null; } /** * This method write a line with different mapping of item and item parts * to sortable (float, string) values. Sorting by these values and comparing * adjacient values establishes how similar the items * are for that indexed value and this similarity will be used for item to item * scoring. * * Standard is to write item_id and checksum out, so that duplication can be eliminated. * * @param tree * @param out */ public void writeIndexLine( XMLNode tree, PrintWriter out ) { out.write( tree.getNodeId()+"\t" ); out.write( tree.getChecksum() + "\n"); } /** * Score two values in given column against each other. If they are considered too far * for scoring throw Exception to advance the scoring window. * * Default is identity scoring (1 if identical, Exception if not) * @param column * @param item1Value * @param item2Value * @return * @throws Exception */ public float partialScore( int column, String item1Value, String item2Value ) throws Exception { if( item1Value.equals( item2Value )) return 1f; throw new Exception(); } /** * If a column is numeric, the sorting will happen numerically otherwise * lexically. Usually indices will be numeric unless the score is an identity check. * * The default is the identity check index. 
* @param column * @return */ public boolean isNumericIndex( int column ) { return false; } /** * First creates the scores for each attribute that needs to be scored. * Then combines the scores for each pair. */ public void buildScoringMatrix() { /* * for column=1 to index count * sort index file by column * create scoring file * for line in index file * addLineToWindow( window, line, scoreColumn, scoreWriter ) * sort scoring files on first and second id * open each scoring file and walk through, calling the * score accumulating function with all scores for one pair of * ids. */ } /** * Scores added line against all other lines in the window and removes * lines from the window that are no longer in scoring range. * Writes scores to partialScores and assumes tab delimited files with * node id on first position (0-column) * @param window * @param line * @param scoreColumn * @param partialScores */ private void addLineToWindow( LinkedList<String[]> window, String line, int scoreColumn, PrintWriter partialScores ) { String[] fields = line.split("\\t"); window.add(fields); ListIterator<String[]> i = window.listIterator(); while( i.nextIndex() < window.size()-1 ) { String comp[] = i.next(); try { float score = partialScore( scoreColumn, fields[scoreColumn], comp[scoreColumn]); long id1 = Long.parseLong(fields[0]); long id2 = Long.parseLong( comp[0] ); if( id1 < id2 ) partialScores.println( fields[0]+ "\t"+ comp[0] + "\t" + score ); else partialScores.println( comp[0]+ "\t"+ fields[0] + "\t" + score ); } catch( Exception e ) { // this line is out of the window i.remove(); } } } /** * How many partial scores do you want to produce? Default is * 1 for the checksum identity of item. * @return */ public int getIndexCount() { return 1; } /** * The given file (which needs to be a ZIP archive) is written back as * BLOB to the database. 
* @param result */ public void writeBack( File result ) { try { zippedOutput = new BlobWrap(); zippedOutput.data = Hibernate.createBlob( new FileInputStream( result ), (int) result.length()); setStatusCode(OK); DB.commit(); // result.delete(); } catch( Exception e ) { log.error( "Writeback failed!", e ); try { setStatusCode(ERROR); setStatusMessage(e.getMessage()); DB.commit(); } catch( Exception e2 ) { log.error( "Status update failed as well!!", e2 ); } } } public File getTmpFile(){ return this.tmpFile; } public void unloadToTmpFile() { try { tmpFile = File.createTempFile("unloadPublication", ".zip"); tmpFile.deleteOnExit(); log.info( "Unloading to " + tmpFile.getAbsolutePath()); FileOutputStream fos = new FileOutputStream( tmpFile ); BufferedOutputStream bos = new BufferedOutputStream( fos,4096 ); InputStream is = getZippedOutput().getData().getBinaryStream(); IOUtils.copy(is, bos); is.close(); bos.flush(); bos.close(); DB.commit(); } catch( Exception e ) { log.error( "Cannot copy BLOB to tmp file", e ); } } /** * Returns a stream to a zip archive. Please cleanup after finished with the Stream. * @return */ public InputStream getDownloadStream() { InputStream is = null; if( tmpFile == null ) unloadToTmpFile(); try { is = new FileInputStream(tmpFile); } catch( Exception e ) { log.error( "File unload problem", e); } return is; } /** * delete the tmp file after using the Download Stream. This will be automated later. */ public void cleanup() { tmpFile.delete(); } } /* * How should the process work? * a) Collect all the items from the transformations, building an index of each item, which should allow for the following: * - access each item * - score items against each other, the index might contain many columns with scores on certain metrics * scores between items are only build from neighboring items in the index (avoid n^2 complexity) * - the collection is happening as XML in files! 
- current approach, one ZIP archive, but this might not work for * millions of items * * b) .. skip other steps so far .. * c) post process by XSL transform to ESE * d) final result is uploaded as ZIP archive to database. */