package org.genedb.web.mvc.model;

import org.genedb.db.audit.ChangeSet;

import org.gmod.schema.feature.AbstractGene;
import org.gmod.schema.feature.Gap;
import org.gmod.schema.feature.Gene;
import org.gmod.schema.feature.Polypeptide;
import org.gmod.schema.feature.Transcript;
import org.gmod.schema.mapped.Feature;

import org.apache.log4j.Logger;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.hibernate.CacheMode;
import org.hibernate.Criteria;
import org.hibernate.FlushMode;
import org.hibernate.ScrollMode;
import org.hibernate.ScrollableResults;
import org.hibernate.Session;
import org.hibernate.SessionFactory;
import org.hibernate.Transaction;
import org.hibernate.criterion.Restrictions;
import org.hibernate.search.FullTextSession;
import org.hibernate.search.Search;
import org.hibernate.search.SearchFactory;
import org.hibernate.search.reader.ReaderProvider;
import org.hibernate.search.store.DirectoryProvider;
import org.springframework.context.ConfigurableApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;

import uk.co.flamingpenguin.jewel.cli.ArgumentValidationException;
import uk.co.flamingpenguin.jewel.cli.Cli;
import uk.co.flamingpenguin.jewel.cli.CliFactory;
import uk.co.flamingpenguin.jewel.cli.Option;

import java.io.Console;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;

import com.google.common.collect.Sets;

/**
 * Create Lucene indices.
 * <p>
 * The way it works is as follows: a list of feature_ids that must be deleted
 * and/or updated is generated. This can be done by:
 * (i) indexing by type. Each indexed feature type is treated in turn; currently
 *     the classes <code>AbstractGene</code>, <code>Transcript</code>,
 *     <code>Polypeptide</code> and <code>Gap</code> are indexed (in that order).
 *     For each type, all features of that type are listed;
 * (ii) the first option can be limited to a given organism;
 * (iii) a list of features can be provided through a <code>ChangeSet</code>.
 * <p>
 * Given this list, the features are loaded and indexed in batches of 10
 * (see <code>BATCH_SIZE</code>). If an exception is thrown while indexing a
 * particular feature, the exception is caught and the whole batch fails.
 * The members of the failed batch are then put into a queue. When all batches
 * of the relevant type have been processed, the queued members of failed
 * batches are indexed individually. If a feature fails this second time, it
 * cannot be indexed (due to bad data, or a bug in the code) and an error is
 * logged. (Note: in this trimmed-down class the one-by-one retry of failed
 * batches is currently commented out.)
 *
 * @author rh11
 */
public class SmallPopulateLuceneIndices { //implements IndexUpdater

    private static Logger logger = Logger.getLogger(SmallPopulateLuceneIndices.class);

    /**
     * The number of features to be processed in a single batch. If it's set too
     * high, we run out of heap space.
     */
    private static final int BATCH_SIZE = 10;

    /**
     * Which types of feature to index.
     */
    private static final Collection<Class<? extends Feature>> INDEXED_CLASSES
        = new ArrayList<Class<? extends Feature>>();
    static {
        INDEXED_CLASSES.add(AbstractGene.class);
        INDEXED_CLASSES.add(Transcript.class);
        INDEXED_CLASSES.add(Polypeptide.class);
        INDEXED_CLASSES.add(Gap.class);
        // Add feature types here, if a new type of feature should be indexed.
        // Don't forget to update the class doc comment!
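        // For example, pseudogenes could also be indexed by registering the
        // corresponding mapped class here (assuming a Pseudogene class exists in
        // org.gmod.schema.feature and is imported above):
        //     INDEXED_CLASSES.add(Pseudogene.class);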
    }

    private boolean failFast = false;

    private ConfigurableGeneDBSessionFactoryBean configurableGeneDBSessionFactoryBean;

    public ConfigurableGeneDBSessionFactoryBean getConfigurableGeneDBSessionFactoryBean() {
        return configurableGeneDBSessionFactoryBean;
    }

    public void setConfigurableGeneDBSessionFactoryBean(
            ConfigurableGeneDBSessionFactoryBean configurableGeneDBSessionFactoryBean) {
        this.configurableGeneDBSessionFactoryBean = configurableGeneDBSessionFactoryBean;
    }

    private String indexBaseDirectory;
    private String organism;
    private int numBatches = 10;

    private String hibernateDialect = "org.hibernate.dialect.PostgreSQLDialect";
    private String hibernateDriverClass = "org.postgresql.Driver";

    /**
     * Index features of the specified class using the supplied full-text session.
     * (The original design indexed in batches first and then retried failures
     * one-by-one; the retry is currently commented out here.)
     *
     * @param featureClass the class of features to index
     * @param session the Hibernate Search session to index with
     */
    public void indexFeatures(Class<? extends Feature> featureClass, FullTextSession session) {
        // Transaction transaction = session.beginTransaction();
        logger.info(String.format("A. The value of session is '%s' and it is '%s'",
            session, session.isConnected()));

        //Set<Integer> failed = batchIndexFeatures(featureClass, -1, session);
        batchIndexFeatures(featureClass, session);

        // transaction.commit();
        //logger.info("About to close session");
        //session.close();
        //logger.info("Session closed");
        // if (failed.size() > 0) {
        //     logger.info("calling reindex");
        //     reindexFailedFeatures(failed);
        // }
        logger.info("Got to end of indexFeatures(Class)");
    }

    public void indexFeatures() {
        SessionFactory sessionFactory = null;
        try {
            sessionFactory = configurableGeneDBSessionFactoryBean
                .createFullTextSessionFactory(indexBaseDirectory, 10);
        } catch (Exception exp) {
            exp.printStackTrace();
            System.exit(65);
        }
        logger.info("sessionFactory is " + sessionFactory);

        Session session = sessionFactory.openSession();
        FullTextSession fs = Search.getFullTextSession(session);
        // session.setFlushMode(FlushMode.MANUAL);
        // session.setCacheMode(CacheMode.IGNORE);
        // logger.info(String.format("Just made. The value of session is '%s' and it is '%s'", session, session.isConnected()));
        // return session;

        Transaction tx = fs.beginTransaction();
        for (Class<? extends Feature> featureClass : INDEXED_CLASSES) {
            indexFeatures(featureClass, fs);
        }
        //fs.getSearchFactory().optimize(Feature.class);
        tx.commit();
        fs.close();
        logger.info("Got to end of indexFeatures()");
        // NB: thrown unconditionally; main() catches it, then closes the application context.
        throw new RuntimeException();
    }

    /**
     * Attempt to index features in batches. An exception while processing a
     * feature will cause the whole batch to fail, so it's worth reindexing
     * failed features one-by-one afterwards; that retry (and the collection of
     * failed feature ids) is currently commented out in this class.
     * If the <code>numBatches</code> field is positive, at most that many
     * batches are processed; otherwise all features are processed.
     *
     * @param featureClass the class of features to index
     * @param fullTextSession the Hibernate Search session to index with
     */
    //@Transactional
    private void batchIndexFeatures(Class<? extends Feature> featureClass,
            FullTextSession fullTextSession) {

        fullTextSession.setFlushMode(FlushMode.MANUAL);
        fullTextSession.setCacheMode(CacheMode.IGNORE);
        Transaction transaction = fullTextSession.beginTransaction();

        // Scrollable results avoid loading too many objects into memory at once
        Set<Integer> failedToLoad = new HashSet<Integer>();
        Criteria criteria = fullTextSession.createCriteria(featureClass);
        criteria.add(Restrictions.eq("obsolete", false)); // Do not index obsolete features
        if (organism != null) {
            criteria.createCriteria("organism")
                .add(Restrictions.eq("commonName", organism));
        }
        if (numBatches > 0) {
            criteria.setMaxResults(numBatches * BATCH_SIZE);
        }
        ScrollableResults results = criteria.setFetchSize(BATCH_SIZE).scroll(ScrollMode.FORWARD_ONLY);

        logger.info(String.format("Indexing %s", featureClass));
        int thisBatchCount = 0;
        Set<Integer> thisBatch = new HashSet<Integer>();

        int index = 0;
        while (results.next()) {
            Feature feature = (Feature) results.get(0);
            thisBatch.add(feature.getFeatureId());
            boolean failed = false;
            index++;
            //try {
            //    logger.debug(String.format("Indexing '%s' (%s)", feature.getUniqueName(),
            //        feature.getClass()));
            fullTextSession.index(feature); // index each element
            //} catch (Exception e) {
            //    logger.error("Batch failed", e);
            //    failed = true;
            //}

            // if (failed || ++thisBatchCount == BATCH_SIZE) {
            //     logger.info(String.format("Indexed %d of %s", i, featureClass));
            //     session.clear();
            //     thisBatchCount = 0;
            //     if (failed) {
            //         failedToLoad.addAll(thisBatch);
            //     }
            //     thisBatch = new HashSet<Integer>();
            // }

            if (index % BATCH_SIZE == 0) {
                fullTextSession.flushToIndexes(); // apply changes to the indexes
                fullTextSession.clear();          // clear, since the queue has been processed
            }
        }
        transaction.commit();
    }

    /* Accessors */

    public void setFailFast(boolean failFast) {
        this.failFast = failFast;
    }

    public int getNumBatches() {
        return numBatches;
    }

    public void setNumBatches(int numBatches) {
        this.numBatches = numBatches;
    }

    private void setOrganism(String organism) {
        this.organism = organism;
    }

    public static void main(String[] args) {

        Cli<PopulateLuceneIndicesArgs> cli = CliFactory.createCli(PopulateLuceneIndicesArgs.class);
        PopulateLuceneIndicesArgs iga = null;
        try {
            iga = cli.parseArguments(args);
        } catch (ArgumentValidationException exp) {
            System.err.println("Unable to run:");
            System.err.println(cli.getHelpMessage());
            exp.printStackTrace();
            return;
        }

        ConfigurableApplicationContext ctx = new ClassPathXmlApplicationContext(
            new String[] {"classpath:applicationContext.xml"});
        SmallPopulateLuceneIndices indexer =
            ctx.getBean("smallPopulateLuceneIndices", SmallPopulateLuceneIndices.class);

        if (iga.isOrganism()) {
            indexer.setOrganism(iga.getOrganism());
        }
        indexer.setFailFast(iga.getFailFast());
        if (iga.isNumBatches()) {
            indexer.setNumBatches(iga.getNumBatches());
        }
        indexer.setIndexBaseDirectory(iga.getIndexDirectory());

        try {
            indexer.indexFeatures();
        } catch (RuntimeException exp) {
            exp.printStackTrace();
            System.err.println("About to go into finally block");
        } finally {
            System.err.println("Going to try to close context");
            ctx.close();
        }
        System.err.println("All indexing finished");

        // Forcibly stop any lingering pool threads left over from indexing,
        // so that the JVM can exit.
        Map<Thread, StackTraceElement[]> m = Thread.getAllStackTraces();
        for (Entry<Thread, StackTraceElement[]> entry : m.entrySet()) {
            Thread t = entry.getKey();
            if (t.getName().matches("pool-\\d+-thread-\\d+")) {
                try {
                    t.stop();
                } catch (ThreadDeath td) {
                    td.printStackTrace();
                    throw td;
                }
            }
        }
    }

    interface PopulateLuceneIndicesArgs {

        /* Testing */
for quick-and-dirty testing") int getNumBatches(); void setNumBatches(int numBatches); boolean isNumBatches(); @Option(shortName="f", longName="failFast", description="Fail on second try if there's a problem") boolean getFailFast(); void setFailFast(boolean failFast); boolean isFailFast(); /* What exactly to index */ @Option(shortName="o", description="Only index this organism") String getOrganism(); void setOrganism(String organism); boolean isOrganism(); /* Index location */ @Option(shortName="i", longName="index", description="Directory where the indices are stored") String getIndexDirectory(); } public String getIndexBaseDirectory() { return indexBaseDirectory; } public void setIndexBaseDirectory(String indexBaseDirectory) { this.indexBaseDirectory = indexBaseDirectory; } public String getHibernateDialect() { return hibernateDialect; } public void setHibernateDialect(String hibernateDialect) { this.hibernateDialect = hibernateDialect; } public String getHibernateDriverClass() { return hibernateDriverClass; } public void setHibernateDriverClass(String hibernateDriverClass) { this.hibernateDriverClass = hibernateDriverClass; } }