package org.genedb.web.mvc.model;
import org.genedb.db.audit.ChangeSet;
import org.gmod.schema.feature.AbstractGene;
import org.gmod.schema.feature.Gap;
import org.gmod.schema.feature.Gene;
import org.gmod.schema.feature.Polypeptide;
import org.gmod.schema.feature.ProductiveTranscript;
import org.gmod.schema.feature.Transcript;
import org.gmod.schema.mapped.Feature;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.hibernate.CacheMode;
import org.hibernate.FlushMode;
import org.hibernate.Query;
import org.hibernate.Session;
import org.hibernate.SessionFactory;
import org.hibernate.Transaction;
import org.hibernate.search.FullTextSession;
import org.hibernate.search.Search;
import org.hibernate.search.SearchFactory;
import org.hibernate.search.reader.ReaderProvider;
import org.hibernate.search.store.DirectoryProvider;
import org.springframework.context.ConfigurableApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.StringUtils;
import uk.co.flamingpenguin.jewel.cli.ArgumentValidationException;
import uk.co.flamingpenguin.jewel.cli.Cli;
import uk.co.flamingpenguin.jewel.cli.CliFactory;
import uk.co.flamingpenguin.jewel.cli.Option;
import java.io.Console;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.google.common.collect.Sets;
/**
* Create Lucene indices.
* <p>
* The way it works is as follows:
* A list of feature_ids is generated that must be deleted and/or updated. This can be done by:
* (i) Indexing by type. Each indexed feature type is treated in turn; currently
* the classes <code>AbstractGene</code>, <code>Transcript</code> and
* <code>Gap</code> are indexed (in that order). For each type, all features of that type
* are listed
* (ii) The first option can be limited by a given organism
* (iii) A list of features can be provided through a <code>ChangeSet</code>
*
* Given this list, the features are loaded and indexed in batches of 10.
* If an exception is thrown while indexing a particular feature, the exception is caught and
* the whole batch will fail.
* The members of the failed batch are then put into a queue. When all batches of the relevant type
* have been processed, the queued members of failed batches are indexed individually. If a feature
* fails this time, that means it cannot be indexed (due to bad data, or a bug in the code).
* An error is logged.
*
* @author rh11
*/
public class PopulateLuceneIndices implements IndexUpdater {
/**
 * {@inheritDoc}
 * <p>
 * NOTE(review): unimplemented stub — always returns 0 and performs no work.
 * Confirm whether this is intentional for this IndexUpdater implementation.
 */
@Override
public int updateTranscriptCache(ChangeSet changeSet) throws Exception {
    // TODO Auto-generated method stub
    return 0;
}
// Class-wide log4j logger, consistent with the rest of this package.
private static Logger logger = Logger.getLogger(PopulateLuceneIndices.class);

/**
 * The number of features to be processed in a single batch. If it's set too
 * high, we run out of heap space.
 */
private static final int DEFAULT_BATCH_SIZE = 10;

// Effective batch size; overridable via setBatchSize() / the -b option.
private int batchSize = DEFAULT_BATCH_SIZE;

/**
 * Which types of feature to index.
 */
private static final Collection<Class<? extends Feature>> INDEXED_CLASSES
    = new ArrayList<Class<? extends Feature>>();
static {
    INDEXED_CLASSES.add(Gap.class);
    // Add feature types here, if a new type of feature should be indexed.
    // Don't forget to update the class doc comment!
}

// When true, a feature that fails on its individual retry aborts the run with
// a RuntimeException instead of only being logged (see reindexFailedFeatures).
private boolean failFast = false;

// Factory used by newSession() to build a full-text-capable SessionFactory
// for the configured index directory; normally injected by Spring.
private ConfigurableGeneDBSessionFactoryBean configurableGeneDBSessionFactoryBean;

public ConfigurableGeneDBSessionFactoryBean getConfigurableGeneDBSessionFactoryBean() {
    return configurableGeneDBSessionFactoryBean;
}

public void setConfigurableGeneDBSessionFactoryBean(
        ConfigurableGeneDBSessionFactoryBean configurableGeneDBSessionFactoryBean) {
    this.configurableGeneDBSessionFactoryBean = configurableGeneDBSessionFactoryBean;
}

// Directory under which the Lucene indices are stored (the -i option).
private String indexBaseDirectory;
// If non-null, only this organism (common name) is indexed (the -o option).
private String organism;
// Number of batches to process; negative means "no limit" (the -n option).
private int numBatches = -1;
// Optional featureId range restricting indexGenes(); -1 / 0 mean "no bound"
// (the -s and -e options). featureStart is inclusive, featureEnd exclusive.
private int featureStart = -1;
private int featureEnd;

// Hibernate connection settings, overridable via the setters below.
private String hibernateDialect = "org.hibernate.dialect.PostgreSQLDialect";
private String hibernateDriverClass = "org.postgresql.Driver";

public PopulateLuceneIndices() {
    // Default constructor
}
/**
 * Create a new FullTextSession, backed by a session factory configured for
 * {@link #indexBaseDirectory}. The session is set to manual flushing and
 * ignores the second-level cache.
 *
 * @param batchSize batch size passed through to the session factory configuration
 * @return an open, connected FullTextSession; the caller is responsible for closing it
 * @throws RuntimeException if the session factory cannot be created
 */
private FullTextSession newSession(int batchSize) {
    SessionFactory sessionFactory;
    try {
        sessionFactory = configurableGeneDBSessionFactoryBean.createFullTextSessionFactory(indexBaseDirectory, batchSize);
    } catch (Exception exp) {
        // The original printed the stack trace and called System.exit(65).
        // Throwing instead preserves the cause and lets callers (and tests)
        // decide how to handle the failure.
        throw new RuntimeException(
            "Unable to create session factory for index directory '" + indexBaseDirectory + "'", exp);
    }
    logger.info("sessionFactory is " + sessionFactory);
    Session basicSession = sessionFactory.openSession();
    FullTextSession session = Search.getFullTextSession(basicSession);
    session.setFlushMode(FlushMode.MANUAL);
    session.setCacheMode(CacheMode.IGNORE);
    logger.info(String.format("Just made. The value of session is '%s' and it is '%s'", session, session.isConnected()));
    return session;
}
/**
 * Index features of the specified class using the supplied session. First
 * indexes the features in batches, and then retries the failures one-by-one
 * (in a fresh session opened by reindexFailedFeatures).
 *
 * @param featureClass the class of features to index
 * @param numBatches the number of batches to process; zero or negative means all
 * @param session the full-text session to index through
 */
public void indexFeatures(Class<? extends Feature> featureClass, int numBatches, FullTextSession session) {
    Set<Integer> failed = batchIndexFeatures(featureClass, numBatches, session);
    if (!failed.isEmpty()) {
        reindexFailedFeatures(failed);
    }
    logger.trace("Leaving indexFeatures (" + featureClass + ")");
}
/**
 * Index everything: all genes (with their transcripts and proteins) first,
 * then each additional feature type listed in INDEXED_CLASSES.
 */
public void indexFeatures() {
    FullTextSession session = newSession(batchSize);
    try {
        indexGenes(session);
        for (Class<? extends Feature> featureClass : INDEXED_CLASSES) {
            indexFeatures(featureClass, numBatches, session);
        }
    } finally {
        // Always release the session — the original leaked it when indexing
        // threw an exception.
        session.close();
    }
    logger.trace("Leaving indexFeatures");
}
/**
 * Index all non-obsolete genes, together with each gene's non-obsolete
 * transcripts and (for productive transcripts) their proteins. The set of
 * genes can be restricted by the configured organism and featureId range.
 * <p>
 * Features are processed in batches of {@link #batchSize}, each batch in its
 * own transaction. If any feature in a batch throws, the whole batch is
 * recorded as failed so it can be retried one-by-one later.
 *
 * @param session the full-text session to index through
 * @return featureIds belonging to batches that failed to index
 */
private Set<Integer> indexGenes(FullTextSession session) {
    Set<Integer> failedToLoad = new HashSet<Integer>();

    // featureStart/featureEnd are ints, so inlining them into the HQL is
    // safe; organism is externally supplied, so it is bound as a named
    // parameter (the original concatenated it into the query string,
    // an HQL-injection risk).
    String hql = "select featureId from AbstractGene where obsolete=false";
    if (featureStart > 0) {
        hql += " and featureId >= " + featureStart;
    }
    if (featureEnd > 0) {
        hql += " and featureId < " + featureEnd;
    }
    if (organism != null) {
        hql += " and organism.commonName = :organism";
    }
    Query idQuery = session.createQuery(hql);
    if (organism != null) {
        idQuery.setParameter("organism", organism);
    }

    logger.info("Indexing AbstractGenes");
    @SuppressWarnings("unchecked")
    List<Integer> allIds = idQuery.list();

    int batchCount = 0;
    int start = 0;
    int end = start + batchSize;
    while (start < allIds.size()) {
        if (end > allIds.size()) {
            end = allIds.size();
        }
        Transaction transaction = session.beginTransaction();
        List<Integer> thisBatch = allIds.subList(start, end);
        // These ids come straight from the database query above, so
        // embedding them in the feature query is safe.
        String ids = StringUtils.collectionToCommaDelimitedString(thisBatch);
        logger.debug(String.format("The list of ids being looked up is '%s'", ids));
        Query featureQuery = session.createQuery("from Feature where featureId in (" + ids + ")");
        if (numBatches > 0) {
            featureQuery.setMaxResults(numBatches * batchSize);
        }
        @SuppressWarnings("unchecked")
        List<AbstractGene> genes = featureQuery.list();
        boolean failed = false;
        int i = 0;
        for (AbstractGene gene : genes) {
            i++;
            try {
                logger.info(String.format("Indexing '%s' (%s)", gene.getUniqueName(), gene.getClass()));
                session.index(gene);
                // Index the gene's transcripts and, where present, the
                // transcript's protein, alongside the gene itself.
                for (Transcript transcript : gene.getTranscripts()) {
                    if (transcript.isObsolete()) {
                        continue;
                    }
                    logger.info(String.format("-Indexing '%s' (%s)", transcript.getUniqueName(), transcript.getClass()));
                    session.index(transcript);
                    if (transcript instanceof ProductiveTranscript) {
                        ProductiveTranscript productiveTranscript = (ProductiveTranscript) transcript;
                        Polypeptide protein = productiveTranscript.getProtein();
                        if (protein != null) {
                            logger.info(String.format("--Indexing '%s' (%s)", protein.getUniqueName(), protein.getClass()));
                            session.index(protein);
                        }
                    }
                }
            } catch (Exception exp) {
                // One bad feature fails the whole batch; the batch's members
                // will be retried individually by reindexFailedFeatures().
                logger.error("Batch failed", exp);
                failed = true;
            }
            if ((i % 10) == 0) {
                logger.debug('.');
            }
        }
        batchCount++;
        logger.debug(String.format("Indexed '%d' percent ('%d' of '%d')",
            (batchCount * batchSize) * 100 / allIds.size(), batchCount * batchSize, allIds.size()));
        if (failed) {
            failedToLoad.addAll(thisBatch);
        } else {
            try {
                session.flushToIndexes();
            } catch (Exception exp) {
                // NOTE(review): preserved from the original — a flush failure
                // aborts the whole JVM. Consider throwing instead.
                exp.printStackTrace(System.err);
                System.exit(1);
            }
        }
        // Detach everything so the session doesn't accumulate entities.
        session.clear();
        start = end;
        end = start + batchSize;
        transaction.commit();
    }
    // Fixed trace message: the original said "Leaving batchIndexFeatures".
    logger.trace("Leaving indexGenes");
    return failedToLoad;
}
/**
 * Attempt to index features in batches. Returns identifiers of the features
 * that failed to be indexed. (An exception processing a feature will cause
 * the whole batch to fail, so it's worth trying to reindex failed features
 * one-by-one.)
 * <p>
 * NOTE(review): {@code numBatches} is accepted but no longer limits the run
 * (the limiting code had been commented out); all matching features are
 * processed.
 * NOTE(review): Spring's proxy-based {@code @Transactional} has no effect on
 * a private method invoked from within the same class.
 *
 * @param featureClass the class of features to index
 * @param numBatches the number of batches to process (currently unused)
 * @param session the full-text session to index through
 * @return a set of featureIds of the features that failed to be indexed
 */
@Transactional
private Set<Integer> batchIndexFeatures(Class<? extends Feature> featureClass,
        int numBatches, FullTextSession session) {
    Set<Integer> failedToLoad = new HashSet<Integer>();

    // organism is externally supplied, so bind it as a named parameter
    // rather than concatenating it into the HQL (injection risk in the
    // original).
    String hql = "select featureId from " + featureClass.getName() + " where obsolete=false";
    if (organism != null) {
        hql += " and organism.commonName = :organism";
    }
    Query idQuery = session.createQuery(hql);
    if (organism != null) {
        idQuery.setParameter("organism", organism);
    }

    logger.info(String.format("Indexing %s", featureClass));
    @SuppressWarnings("unchecked")
    List<Integer> allIds = idQuery.list();

    int batchCount = 0;
    int start = 0;
    int end = start + batchSize;
    while (start < allIds.size()) {
        if (end > allIds.size()) {
            end = allIds.size();
        }
        List<Integer> thisBatch = allIds.subList(start, end);
        // These ids come straight from the database, so inlining them is safe.
        String ids = StringUtils.collectionToCommaDelimitedString(thisBatch);
        Query featureQuery = session.createQuery("from " + featureClass.getName() + " where featureId in (" + ids + ")");
        @SuppressWarnings("unchecked")
        List<Feature> features = featureQuery.list();
        boolean failed = false;
        int i = 0;
        for (Feature feature : features) {
            i++;
            try {
                logger.debug(String.format("Indexing '%s' (%s)", feature.getUniqueName(), feature.getClass()));
                session.index(feature);
                batchCount++;
                logger.debug(String.format("Indexed '%d' ('%d' of '%d') of type '%s'",
                    (batchCount * batchSize) * 100 / allIds.size(), batchCount * batchSize, allIds.size(), featureClass));
            } catch (Exception exp) {
                // One bad feature fails the whole batch; its members are
                // retried individually by reindexFailedFeatures().
                logger.error("Batch failed", exp);
                failed = true;
            }
            if ((i % 10) == 0) {
                logger.warn('.');
            }
        }
        if (failed) {
            logger.warn("Adding failed to batch");
            failedToLoad.addAll(thisBatch);
        } else {
            logger.debug("About to flush to indices");
            try {
                session.flushToIndexes();
            } catch (Exception exp) {
                // NOTE(review): preserved from the original — a flush failure
                // aborts the whole JVM. Consider throwing instead.
                exp.printStackTrace(System.err);
                System.exit(1);
            }
            logger.debug("Just flushed to indices");
        }
        logger.debug("About to clear session");
        session.clear();
        logger.debug("Just cleared session");
        start = end;
        end = start + batchSize;
    }
    logger.trace("Leaving batchIndexFeatures");
    return failedToLoad;
}
/**
 * Attempt to index the provided features individually (i.e. in batches of
 * one). Used to reindex failures from a batch indexing run.
 * <p>
 * If a feature fails again and {@link #failFast} is set, a RuntimeException
 * (with the original cause) is thrown; otherwise the failure is logged and
 * the remaining features are still attempted.
 *
 * @param failed a set of featureIds to reindex
 * @throws RuntimeException if failFast is set and a feature fails a second time
 */
private void reindexFailedFeatures(Set<Integer> failed) {
    logger.info("Attempting to reindex failed features");
    FullTextSession session = newSession(1);
    try {
        Transaction transaction = session.beginTransaction();
        for (int featureId : failed) {
            logger.debug(String.format("Attempting to index feature %d", featureId));
            Feature feature = (Feature) session.load(Feature.class, featureId);
            logger.debug(String.format("Loaded feature '%s'", feature.getUniqueName()));
            try {
                session.index(feature);
                logger.debug("Feature successfully indexed");
            } catch (Exception exp) {
                String msg = String.format("Failed to index feature '%s' on the second attempt", feature.getUniqueName());
                if (failFast) {
                    throw new RuntimeException(msg, exp);
                }
                logger.info(msg, exp);
            }
            // Detach the feature so the one-at-a-time session stays small.
            session.clear();
        }
        transaction.commit();
    } finally {
        // Close the session even when failFast aborts the loop — the
        // original leaked it in that case.
        session.close();
    }
}
/**
 * Index the given features by id, then retry any failures one-by-one.
 *
 * @param featureIds the featureIds of the features to index
 */
public void indexFeatures(List<Integer> featureIds) {
    FullTextSession session = newSession(batchSize);
    Set<Integer> failed;
    try {
        failed = batchIndexFeatures(featureIds, session);
    } finally {
        // Close the session even if indexing throws — the original leaked
        // it in that case.
        session.close();
    }
    if (!failed.isEmpty()) {
        reindexFailedFeatures(failed);
    }
    logger.trace("Leaving indexFeatures");
}
/**
 * Update the Lucene indices from a ChangeSet: index documents for deleted
 * genes, transcripts, polypeptides and gaps are removed, and new or changed
 * features of those types are (re)indexed.
 *
 * @param changeSet the set of feature changes to apply
 * @return true on success, false if the indices could not be updated
 */
public boolean updateAllCaches(ChangeSet changeSet) {
    // Ignore changes to top level feature
    try {
        // Let's process deletes first
        Set<Integer> deletedIds = Sets.newHashSet();
        deletedIds.addAll(changeSet.deletedFeatureIds(Gene.class));
        deletedIds.addAll(changeSet.deletedFeatureIds(Transcript.class));
        deletedIds.addAll(changeSet.deletedFeatureIds(Polypeptide.class));
        deletedIds.addAll(changeSet.deletedFeatureIds(Gap.class));
        deleteFromIndex(deletedIds);

        // Now adds and updates
        Set<Integer> alteredIds = Sets.newHashSet();
        alteredIds.addAll(changeSet.newFeatureIds(Gene.class));
        alteredIds.addAll(changeSet.changedFeatureIds(Gene.class));
        alteredIds.addAll(changeSet.newFeatureIds(Transcript.class));
        alteredIds.addAll(changeSet.changedFeatureIds(Transcript.class));
        alteredIds.addAll(changeSet.newFeatureIds(Polypeptide.class));
        alteredIds.addAll(changeSet.changedFeatureIds(Polypeptide.class));
        alteredIds.addAll(changeSet.newFeatureIds(Gap.class));
        alteredIds.addAll(changeSet.changedFeatureIds(Gap.class));

        FullTextSession session = newSession(batchSize);
        Set<Integer> failed;
        try {
            failed = batchIndexFeatures(alteredIds, session);
        } finally {
            // Close the session even if indexing throws — the original
            // leaked it in that case.
            session.close();
        }
        if (!failed.isEmpty()) {
            reindexFailedFeatures(failed);
        }
    } catch (IOException exp) {
        // NOTE(review): only IOException is treated as a recoverable failure
        // here; runtime exceptions (e.g. from failFast) still propagate to
        // the caller. Confirm that is the intended contract.
        logger.error("Failed to update Lucene indices", exp);
        return false;
    }
    return true;
}
/**
 * Delete all the given ids from the index, by removing every Lucene document
 * whose {@code featureId} field matches one of them.
 *
 * @param ids the list of feature ids
 * @throws IOException if the index cannot be read or written
 */
private void deleteFromIndex(Collection<Integer> ids) throws IOException {
    // DEFAULT_BATCH_SIZE replaces the magic constant 10 used originally.
    FullTextSession session = newSession(DEFAULT_BATCH_SIZE);
    try {
        SearchFactory searchFactory = session.getSearchFactory();
        ReaderProvider readerProvider = searchFactory.getReaderProvider();
        DirectoryProvider<?>[] directoryProviders = searchFactory.getDirectoryProviders(Feature.class);
        if (directoryProviders == null || directoryProviders.length < 1) {
            throw new RuntimeException("Unable to open a directory provider");
        }
        IndexReader reader = readerProvider.openReader(directoryProviders);
        try {
            for (Integer id : ids) {
                reader.deleteDocuments(new Term("featureId", Integer.toString(id)));
            }
        } finally {
            // Release the reader even if a delete throws — the original
            // leaked both the reader and the session on failure.
            readerProvider.closeReader(reader);
        }
    } finally {
        session.close();
    }
}
/**
 * Attempt to index the given features in batches of {@link #batchSize}.
 * Returns identifiers of the features that failed to be indexed. (An
 * exception processing a feature causes the whole current batch to be
 * recorded as failed, so it's worth retrying failed features one-by-one.)
 * <p>
 * NOTE(review): unlike the class-based overload, this method never calls
 * flushToIndexes() and opens no transaction before session.clear() — confirm
 * that queued index work is not discarded by the clear.
 * NOTE(review): Spring's proxy-based {@code @Transactional} has no effect on
 * a private method invoked from within the same class.
 *
 * @param featureIds the featureIds of the features to index
 * @param session the full-text session to index through
 * @return a set of featureIds of the features that failed to be indexed
 */
@Transactional
private Set<Integer> batchIndexFeatures(Collection<Integer> featureIds,
        FullTextSession session) {
    logger.info(String.format("C. The value of session is '%s' and it is '%s'", session, session.isConnected()));
    Set<Integer> failedToLoad = new HashSet<Integer>();
    int thisBatchCount = 0;
    Set<Integer> thisBatch = new HashSet<Integer>();
    int i = 0;
    for (Integer featureId : featureIds) {
        Feature feature = (Feature) session.load(Feature.class, featureId);
        thisBatch.add(featureId);
        boolean failed = false;
        try {
            logger.debug(String.format("Indexing '%s' (%s)", feature.getUniqueName(),
                feature.getClass()));
            session.index(feature);
        } catch (Exception exp) {
            logger.error("Batch failed", exp);
            failed = true;
        }
        // End of a batch: either this feature failed (fail the whole batch)
        // or batchSize features have been indexed since the last clear.
        if (failed || ++thisBatchCount == batchSize) {
            // NOTE(review): 'i' is incremented after this log line, so the
            // reported count is off by one.
            logger.debug(String.format("Indexed %d of %d", i, featureIds.size()));
            session.clear();
            thisBatchCount = 0;
            if (failed) {
                failedToLoad.addAll(thisBatch);
            }
            thisBatch = new HashSet<Integer>();
        }
        i++;
    }
    logger.info(String.format("C. The value of session is '%s' and it is '%s'", session, session.isConnected()));
    return failedToLoad;
}
/* Accessors */

// When true, a feature that fails its individual retry aborts the run with a
// RuntimeException instead of only being logged.
public void setFailFast(boolean failFast) {
    this.failFast = failFast;
}

public int getNumBatches() {
    return numBatches;
}

public void setNumBatches(int numBatches) {
    this.numBatches = numBatches;
}

// Restrict indexing to a single organism by common name; private because it
// is only set from main() in this class.
private void setOrganism(String organism) {
    this.organism = organism;
}
/**
 * Prompt the user for a database password on the interactive console.
 * Exits the JVM (status 1) if no console is attached, or if the console's
 * input stream ends before a password can be read.
 *
 * @param databaseUrl shown in the prompt so the user knows which database
 * @param databaseUsername shown in the prompt
 * @return the password as entered (possibly empty)
 */
public static String promptForPassword(String databaseUrl, String databaseUsername) {
    Console console = System.console();
    if (console == null) {
        System.err.println("No password has been supplied, and no console found\n");
        System.exit(1);
        return ""; // Dummy to prevent null warning
    }
    char[] password = console.readPassword("Password for %s@%s: ", databaseUsername, databaseUrl);
    if (password == null) {
        // readPassword returns null at end-of-stream; the original looped
        // while (password == null), which busy-looped forever in that case.
        System.err.println("No password could be read from the console");
        System.exit(1);
        return ""; // Dummy to prevent null warning
    }
    return new String(password);
}
/**
 * Command-line arguments for {@link PopulateLuceneIndices}, parsed by
 * JewelCli. Each {@code isXxx()} method reports whether the corresponding
 * optional argument was supplied on the command line.
 */
interface PopulateLuceneIndicesArgs {
    /* Testing */
    @Option(shortName="n", description="Number of batches - only useful for quick-and-dirty testing")
    int getNumBatches();
    void setNumBatches(int numBatches);
    boolean isNumBatches();

    @Option(shortName="f", longName="failFast", description="Fail on second try if there's a problem")
    boolean getFailFast();
    void setFailFast(boolean failFast);
    boolean isFailFast();

    /* What exactly to index */
    @Option(shortName="o", description="Only index this organism")
    String getOrganism();
    void setOrganism(String organism);
    boolean isOrganism();

    /* Index location */
    @Option(shortName="i", longName="index", description="Directory where the indices are stored")
    String getIndexDirectory();

    /* Batch size */
    @Option(shortName="b", description="(Optional) batch size")
    int getBatchSize();
    void setBatchSize(int batchSize);
    boolean isBatchSize();

    /* Feature start */
    @Option(shortName="s", description="(Optional) featureId start")
    int getFeatureStart();
    void setFeatureStart(int featureStart);
    boolean isFeatureStart();

    /* Feature end */
    @Option(shortName="e", description="(Optional) featureId end")
    int getFeatureEnd();
    void setFeatureEnd(int featureEnd);
    boolean isFeatureEnd();
}
// Directory under which the Lucene indices are stored.
public String getIndexBaseDirectory() {
    return indexBaseDirectory;
}

public void setIndexBaseDirectory(String indexBaseDirectory) {
    this.indexBaseDirectory = indexBaseDirectory;
}

// Hibernate SQL dialect class name (defaults to PostgreSQL).
public String getHibernateDialect() {
    return hibernateDialect;
}

public void setHibernateDialect(String hibernateDialect) {
    this.hibernateDialect = hibernateDialect;
}

// JDBC driver class name (defaults to the PostgreSQL driver).
public String getHibernateDriverClass() {
    return hibernateDriverClass;
}

public void setHibernateDriverClass(String hibernateDriverClass) {
    this.hibernateDriverClass = hibernateDriverClass;
}

// Number of features indexed per batch; see DEFAULT_BATCH_SIZE.
public void setBatchSize(int batchSize) {
    this.batchSize = batchSize;
}
/**
 * Command-line entry point: parses the options described by
 * {@link PopulateLuceneIndicesArgs}, configures the Spring-provided
 * indexer bean accordingly, and runs a full indexing pass.
 *
 * @param args command-line arguments (see PopulateLuceneIndicesArgs)
 */
public static void main(String[] args) {
    PropertyConfigurator.configure("resources/classpath/log4j.index.properties");

    Cli<PopulateLuceneIndicesArgs> cli = CliFactory.createCli(PopulateLuceneIndicesArgs.class);
    PopulateLuceneIndicesArgs iga;
    try {
        iga = cli.parseArguments(args);
    } catch (ArgumentValidationException exp) {
        // A usage error is bad user input, not a crash: show the message and
        // the help text instead of a stack trace, and exit non-zero (the
        // original returned, giving exit status 0 on bad arguments).
        System.err.println("Unable to run: " + exp.getMessage());
        System.err.println(cli.getHelpMessage());
        System.exit(64); // EX_USAGE
        return;
    }

    ConfigurableApplicationContext ctx = new ClassPathXmlApplicationContext(
        new String[] {"classpath:applicationContext.xml"});
    PopulateLuceneIndices indexer = ctx.getBean("populateLuceneIndices", PopulateLuceneIndices.class);

    // Apply only the options the user actually supplied.
    if (iga.isOrganism()) {
        indexer.setOrganism(iga.getOrganism());
    }
    indexer.setFailFast(iga.getFailFast());
    if (iga.isNumBatches()) {
        indexer.setNumBatches(iga.getNumBatches());
    }
    if (iga.isBatchSize()) {
        indexer.setBatchSize(iga.getBatchSize());
    }
    if (iga.isFeatureStart()) {
        indexer.setFeatureStart(iga.getFeatureStart());
    }
    if (iga.isFeatureEnd()) {
        indexer.setFeatureEnd(iga.getFeatureEnd());
    }
    indexer.setIndexBaseDirectory(iga.getIndexDirectory());

    indexer.indexFeatures();
    logger.trace("Leaving main");
    // Hard exit: background threads started by Hibernate/Spring would
    // otherwise keep the JVM alive.
    System.exit(0);
}
// Lower bound (inclusive) of the featureId range used by indexGenes().
public void setFeatureStart(int featureStart) {
    this.featureStart = featureStart;
}

// Upper bound (exclusive) of the featureId range used by indexGenes().
public void setFeatureEnd(int featureEnd) {
    this.featureEnd = featureEnd;
}
}