// JxtImportServiceImpl.java example
//
// Explorer
// juxta-service-master
package org.juxtasoftware.service.importer.jxt;

import static org.juxtasoftware.service.importer.jxt.Util.isContainedIn;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import javax.xml.stream.XMLStreamException;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.json.simple.JSONObject;
import org.juxtasoftware.Constants;
import org.juxtasoftware.dao.AlignmentDao;
import org.juxtasoftware.dao.CacheDao;
import org.juxtasoftware.dao.ComparisonSetDao;
import org.juxtasoftware.dao.JuxtaAnnotationDao;
import org.juxtasoftware.dao.JuxtaXsltDao;
import org.juxtasoftware.dao.SourceDao;
import org.juxtasoftware.dao.WitnessDao;
import org.juxtasoftware.dao.WorkspaceDao;
import org.juxtasoftware.model.Alignment;
import org.juxtasoftware.model.Alignment.AlignedAnnotation;
import org.juxtasoftware.model.CollatorConfig;
import org.juxtasoftware.model.ComparisonSet;
import org.juxtasoftware.model.JuxtaAnnotation;
import org.juxtasoftware.model.JuxtaXslt;
import org.juxtasoftware.model.Source;
import org.juxtasoftware.model.Source.Type;
import org.juxtasoftware.model.Witness;
import org.juxtasoftware.model.Workspace;
import org.juxtasoftware.service.ComparisonSetCollator;
import org.juxtasoftware.service.SourceTransformer;
import org.juxtasoftware.service.Tokenizer;
import org.juxtasoftware.service.importer.ImportService;
import org.juxtasoftware.service.importer.JuxtaXsltFactory;
import org.juxtasoftware.service.importer.XmlTemplateParser;
import org.juxtasoftware.service.importer.XmlTemplateParser.TemplateInfo;
import org.juxtasoftware.service.importer.jxt.JxtRevisionExtractor.RevisionOccurrence;
import org.juxtasoftware.service.importer.jxt.ManifestParser.SourceInfo;
import org.juxtasoftware.service.importer.jxt.MovesParser.JxtMoveInfo;
import org.juxtasoftware.util.BackgroundTaskSegment;
import org.juxtasoftware.util.BackgroundTaskStatus;
import org.juxtasoftware.util.NamespaceExtractor;
import org.juxtasoftware.util.NamespaceExtractor.NamespaceInfo;
import org.juxtasoftware.util.NamespaceExtractor.XmlType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.config.BeanDefinition;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Service;
import org.xml.sax.SAXException;

import com.google.common.base.Preconditions;
import com.google.common.io.ByteStreams;
import com.google.common.io.Closeables;
import com.google.common.io.Files;

import eu.interedition.text.Name;
import eu.interedition.text.NameRepository;

/**
 * Service used to import JXT files from the desktop juxta. It supports only the 
 * latest version of juxta desktop (1.6.5). There is no support for target xpath
 * in the parse templates. For imports of documents using the juxta-document template,
 * the biblio data is stripped out to help compensate for this.
 * 
 * @author loufoster
 *
 */
@Service
@Scope(BeanDefinition.SCOPE_PROTOTYPE)
public class JxtImportServiceImpl implements ImportService<InputStream> {
    @Autowired private ManifestParser manifestParser;
    @Autowired private MovesParser movesParser;
    @Autowired private XmlTemplateParser templateParser;
    @Autowired private JuxtaXsltFactory xsltFactory;
    @Autowired private SourceDao sourceDao;
    @Autowired private JuxtaXsltDao xsltDao;
    @Autowired private NameRepository nameRepo;
    @Autowired private CacheDao cacheDao;
    @Autowired private SourceTransformer transformer;
    @Autowired private ComparisonSetDao setDao;
    @Autowired private WitnessDao witnessDao;
    @Autowired private Tokenizer tokenizer;
    @Autowired private ComparisonSetCollator collator;
    @Autowired private JuxtaAnnotationDao annotationDao;
    @Autowired private AlignmentDao alignmentDao;
    @Autowired private WorkspaceDao workspaceDao;
    
    private Workspace ws;
    private ComparisonSet set;
    private BackgroundTaskStatus taskStatus;
    private BackgroundTaskSegment taskSegment;
    protected static final Logger LOG = LoggerFactory.getLogger( Constants.WS_LOGGER_NAME );
    

    /**
     * Import a JXT file from juxta version 1.6 or greater. Older versions will cause this method
     * to throw 
     * @param set
     * @return
     */
    @Override
    public void doImport(final ComparisonSet set,final InputStream jxtIs, BackgroundTaskStatus status) throws Exception {
        final File sessionDataDir = Files.createTempDir();
        
        // stuff key data into class members
        this.set = set;
        this.taskStatus = status;
        this.ws = this.workspaceDao.find(this.set.getWorkspaceId());
        
        // segment the status tracker to match the steps in the import process:
        //  unzip, parse manifest, parse moves, parse templates, populate set, 
        //  add transpositions, tokenize, collate
        final int importSteps = 8;
        this.taskSegment = this.taskStatus.add(1, new BackgroundTaskSegment( importSteps ));

        try {
            // unzip the jxt file into a temp directory
            this.taskStatus.setNote("Inflating JXT data");
            unzip(sessionDataDir, jxtIs);
            this.taskSegment.incrementValue();
            
            // parse the manifest found in the unzipped
            // directory into a list of source data
            LOG.info("Parsing manifest");
            List<SourceInfo> sources = parseManifest(sessionDataDir);
            
            // parse out any transpositions
            LOG.info("Parsing moves");
            List<JxtMoveInfo> moves = parseMoves(sessionDataDir);
            
            // Grab the associated templates and parse them out into a map.
            // this map is held internally in the template parser.
            LOG.info("Parsing templates");
            parseTemplates(sessionDataDir);
                      
            // combine all of this data into the comparison set
            LOG.info("Create set");
            prepareSet( );
            populateSet( sources, moves );
            
            // add in the transpositions!
            addTranspositions( moves );
                        
            // tokenize and collate
            CollatorConfig cfg = this.setDao.getCollatorConfig(this.set);
            this.set.setStatus(ComparisonSet.Status.COLLATING);
            tokenize( cfg );
            collate( cfg );
            
            this.taskStatus.setNote("Import successful");
            
        } finally {
            try {
                FileUtils.deleteDirectory(sessionDataDir);
            } catch (IOException e) {}
        }
    }
    
    private void prepareSet() {

        // grab all witnesses associated with this set.
        // If there are none, there is nothing more to do
        List<Witness> witnesses = this.setDao.getWitnesses(this.set);
        if ( witnesses.size() == 0) {
            return;
        }
        
        // clear out all prior data (NOTE: delete all witnesses wil also clear out all
        // aligment and annotation data )
        this.setDao.deleteAllWitnesses(this.set);
        this.cacheDao.deleteAll(this.set.getId());
        try {
            for (Witness witness : witnesses) {
                Source s = this.sourceDao.find(this.ws.getId(), witness.getSourceId());
                JuxtaXslt xslt = this.xsltDao.find(witness.getXsltId());
                this.witnessDao.delete(witness);
                this.sourceDao.delete(s);
                this.xsltDao.delete(xslt);
            }
        } catch (Exception e) {
            throw new RuntimeException("Unable to overwrite set; witnesses are in use in another set.");
        }
    }
    
    private void addTranspositions(List<JxtMoveInfo> moves) {
        this.taskStatus.setNote("Adding Transpositions");
        Name transQname = this.nameRepo.get(Constants.TRANSPOSITION_NAME) ;
        LOG.info("Adding Transpositions");
        List<Alignment> moveLinks = new ArrayList<Alignment>();
        for ( JxtMoveInfo move : moves ) {
            Alignment link = new Alignment();
            link.setComparisonSetId(this.set.getId());
            link.setManual();
            link.setName( transQname );
            
            for ( String title : move.getWitnessTitles() ) {
                Witness witness = this.witnessDao.find(this.set, title);
                if ( witness == null ) {
                    LOG.error("Unable to import moves. Witness "+title+" not found");
                    this.taskSegment.incrementValue();
                    return;
                }
                JuxtaAnnotation anno = new JuxtaAnnotation( this.set.getId(), witness, transQname, move.getWitnessRange(title) );
                anno.setManual();
                Long annoId = this.annotationDao.create(anno);
                AlignedAnnotation aa =  new AlignedAnnotation(anno.getName(), witness.getId(), annoId, anno.getRange());
                link.addAnnotation( aa );
            }
            moveLinks.add(link);
        }
        this.alignmentDao.create(moveLinks);
        
        this.taskSegment.incrementValue();
        
    }

    /**
     * Parse a list of <code>SourceInfo</code> objects from the manifest file
     * @param sessionDataDir
     * @return
     * @throws Exception
     */
    private List<SourceInfo> parseManifest( File sessionDataDir) throws Exception {
        this.taskStatus.setNote("Parsing manifest");
        final File manifest = new File(sessionDataDir, "manifest.xml");
        if (!manifest.isFile()) {
            throw new IOException("No manifest.xml");
        }
        this.manifestParser.parse(this.set, sessionDataDir, manifest);
        List<SourceInfo> sources = this.manifestParser.getSources();
        this.taskSegment.incrementValue();
        return sources;    
    }
    
    /**
     * Parse a list of <code>MoveInfo</code> objects from the moves file
     * @param sessionDataDir
     * @return
     * @throws Exception
     */
    private List<JxtMoveInfo> parseMoves( File sessionDataDir) throws Exception {
        this.taskStatus.setNote("Parsing moves");
        final File movesFile = new File(sessionDataDir, "moves.xml");
        if (!movesFile.isFile()) {
            throw new IOException("No moves.xml");
        }
        List<JxtMoveInfo> moves = this.movesParser.parse(this.set, movesFile);
        this.taskSegment.incrementValue();
        return moves;    
    }
    
    /**
     * Parse out all parse templates from the templates.xml file.
     * @param sessionDataDir
     * @throws Exception
     */
    private void parseTemplates( File sessionDataDir ) throws Exception {
        this.taskStatus.setNote("Parsing templates");
        final File templates = new File(sessionDataDir, "templates.xml");
        if (!templates.isFile()) {
            throw new IOException("No templates.xml");
        }
        this.templateParser.parse( new FileInputStream(templates ) );
        this.taskSegment.incrementValue();
    }
    
      
    /**
     * Create all sources, templates and witnesses and use them to populate the 
     * comparison set with the new sources. Update the base document too.\
     * 
     * @param sources
     * @throws Exception
     */
    private void populateSet( List<SourceInfo> sources, List<JxtMoveInfo> moves ) throws Exception {
        // Use collected data to create soures, templates, witness and
        // add them all to the comparison set
        Set<Witness> witnesses = new HashSet<Witness>();
        this.taskStatus.setNote("Adding witnesses to comparison set");
        for ( SourceInfo srcInfo : sources ) {

            // determine type of source
            String srcName = JSONObject.escape(srcInfo.getSrcFile().getName());
            this.taskStatus.setNote("Adding raw source document: "+srcName); 
            int extPos = srcName.lastIndexOf('.');
            String ext = ".txt";
            if ( extPos > -1 ) {
                ext = srcName.substring(extPos);
            }
            Source.Type contentType = Source.Type.TXT;
            if ( ext.equalsIgnoreCase(".xml") ) {
                contentType = Source.Type.XML;
            }

            // create the juxta source
            Source source = createSource(srcInfo, contentType);     
                  
            // if the source was associated with a parse template,
            // create it and use it to transform to a witness
            this.taskStatus.setNote("Transform raw "+srcName+" into witness");
            Long witnessId = null;
            JuxtaXslt xslt = null;
            if ( contentType.equals(Source.Type.XML) ) {
                // extract namespace info
                Set<NamespaceInfo> namespaces = NamespaceExtractor.extract( this.sourceDao.getContentReader(source) ); 
                NamespaceInfo namespace = NamespaceInfo.createBlankNamespace();
                if ( namespaces.size() == 1 ) {
                    namespace = (NamespaceInfo)namespaces.toArray()[0];
                    XmlType xmlType = NamespaceExtractor.determineXmlType( this.sourceDao.getContentReader(source) );
                    if ( xmlType.equals(XmlType.TEI)) {
                        namespace.setDefaultPrefix("tei");
                    }
                }
                
                // record any accepted revisions this witness may have had
                TemplateInfo info = this.templateParser.findTemplateInfo(srcInfo.getTemplateGuid());
                xslt = this.xsltFactory.createFromTemplateInfo(source.getWorkspaceId(), srcInfo.getTitle(), info, namespace);
                addRevisonExclusions(source, xslt, namespace, srcInfo.getAcceptedRevsions() );
                witnessId = this.transformer.transform(source, xslt, srcInfo.getTitle());
            } else {
                // Just null transform it to a witness
              witnessId = this.transformer.transform(source, null, source.getName());
            }

            // add all witnesses to the set and update with base witness
            Witness newWitness = this.witnessDao.find(witnessId);
            witnesses.add(  newWitness );
        }

        this.taskStatus.setNote("Create comparison set");
        this.setDao.addWitnesses(this.set, witnesses);
        this.setDao.update(this.set);
        this.taskSegment.incrementValue();
    }

    private void addRevisonExclusions(Source source, JuxtaXslt xslt, NamespaceInfo namespace, List<Integer> acceptedRevsions) throws SAXException, IOException {
        if ( acceptedRevsions.size() == 0 ) {
            // when none are accepted, add an exclusion for all 
            // add tag and addSpan tags. The deletes remain
            xslt.addGlobalExclusion( namespace.addNamespacePrefix("add") );
            xslt.addGlobalExclusion( namespace.addNamespacePrefix("addSpan") );
        } else {
            // extract the exclusion info and add single exclusions to the XSLT
            JxtRevisionExtractor extractor = new JxtRevisionExtractor();
            extractor.extract( this.sourceDao.getContentReader(source), acceptedRevsions);
            for (RevisionOccurrence rev : extractor.getExcludedRevisions() ) {
                xslt.addSingleExclusion( namespace.addNamespacePrefix(rev.getTagName()), rev.getOccurrence() );
            }
        }
        this.xsltDao.update(xslt.getId(), new StringReader(xslt.getXslt()));
    }
    
    private Source createSource(SourceInfo srcInfo, Type contentType) throws FileNotFoundException, IOException, XMLStreamException {
        
        String name = srcInfo.getTitle();
        if ( this.sourceDao.exists(this.ws, name)) {
            name = this.sourceDao.makeUniqueName(this.ws, name);
            srcInfo.setTitle(name);
        }
        FileInputStream fis = new FileInputStream(srcInfo.getSrcFile());
        InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
        Long srcId = this.sourceDao.create(this.ws, name, contentType, isr);
        IOUtils.closeQuietly(isr);
        return this.sourceDao.find(this.ws.getId(), srcId);
    }
    
    /**
     * Tokenize the comparison set
     * @param cfg  
     * @throws IOException
     */
    private void tokenize( CollatorConfig cfg ) throws IOException {
        this.taskStatus.setNote("Tokenizing comparison set");
        this.tokenizer.tokenize(this.set, cfg, this.taskStatus);
        this.taskSegment.incrementValue();
    }
    
    /**
     * Collate the comparison set
     * @throws IOException
     */
    private void collate( CollatorConfig cfg ) throws IOException {
        this.taskStatus.setNote("Collating comparison set");
        this.collator.collate(this.set, cfg, this.taskStatus);
        this.taskSegment.incrementValue();
    }
    
    private void unzip(File to, InputStream jxtIs) throws IOException {

        final ZipInputStream zip = new ZipInputStream(jxtIs);
        while (true) {
            final ZipEntry entry = zip.getNextEntry();
            if (entry == null) {
                break;
            }

            final File entryFile = new File(to, entry.getName());
            Preconditions.checkArgument(isContainedIn(to, entryFile));

            if (!entry.isDirectory()) {
                final File parentFile = entryFile.getParentFile();
                if (!parentFile.isDirectory()) {
                    parentFile.mkdirs();
                }
                Preconditions.checkState(parentFile.isDirectory());
                FileOutputStream entryStream = null;
                try {
                    ByteStreams.copy(zip, entryStream = new FileOutputStream(entryFile));
                } finally {
                    Closeables.close(entryStream, false);
                }
            }
        }
    }
}