package org.genedb.web.mvc.model;
import org.genedb.db.analyzers.AllNamesAnalyzer;
import org.gmod.schema.utils.CvTermUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* Builds Lucene indices from the Chado database. This is an experiment to
* see how far we can improve on the performance of {@link PopulateLuceneIndices}
* by using direct JDBC and Lucene calls, rather than using Hibernate and
* Hibernate Search.
*
* @author rh11
*
*/
public class LuceneIndexBuilder {
    private static final Logger logger = Logger.getLogger(LuceneIndexBuilder.class);

    /** Name of the subdirectory of the index directory that holds the transcript index. */
    private static final String TRANSCRIPT_INDEX_NAME = "transcript";

    /**
     * Command-line entry point.
     * <p>
     * Expects four arguments: the JDBC connection string, the database user,
     * the index directory, and the organism common name. The database password
     * is read interactively from the console.
     */
    public static void main(String[] args)
        throws IOException, ClassNotFoundException, SQLException, ProcessingException
    {
        PropertyConfigurator.configure("resources/classpath/log4j.index.properties");

        String jdbcConnectionString = args[0];
        String databaseUser = args[1];
        String indexDirectory = args[2];
        String organismCommonName = args[3];

        String databasePassword = new String(System.console().readPassword(
            "Password for %s@%s: ",
            databaseUser, jdbcConnectionString));

        Class.forName("org.postgresql.Driver");
        Connection conn = DriverManager.getConnection(
            jdbcConnectionString, databaseUser, databasePassword);
        try {
            conn.setAutoCommit(false); // Autocommit must be off to use cursors

            LuceneIndexBuilder luceneIndexBuilder = new LuceneIndexBuilder(new File(indexDirectory));

            /*
             * A command-line option to override the JDBC fetch size was tried
             * and removed: there didn't seem to be anything to be gained by
             * changing it, and the default of 100 offers a good tradeoff.
             */
            luceneIndexBuilder.addOrganism(conn, organismCommonName);
        } finally {
            // Previously the connection was never closed; release it explicitly.
            try {
                conn.close();
            } catch (SQLException e) {
                logger.error("Error closing database connection", e);
            }
        }
    }

    /** Root directory under which the individual index directories live. */
    private File indexDirectory;

    // Defaults for new index directories
    private Analyzer analyzer = new StandardAnalyzer(); // Is this an appropriate analyzer?
    private IndexWriter.MaxFieldLength maxFieldLength = IndexWriter.MaxFieldLength.LIMITED;

    /**
     * Open an <code>IndexWriter</code> for the index stored in the
     * subdirectory of {@link #indexDirectory} with the given name.
     */
    private IndexWriter getIndexWriter(String name)
        throws CorruptIndexException, LockObtainFailedException, IOException
    {
        Directory directory = FSDirectory.getDirectory(new File(indexDirectory, name));
        return new IndexWriter(directory, analyzer, maxFieldLength);
    }

    // Database connection; set for the duration of each addOrganism() call.
    private Connection conn;

    // JDBC fetch size used when streaming features from the database.
    private int fetchSize = 100;

    private IndexWriter featureIndexWriter;

    public LuceneIndexBuilder(File indexDirectory) {
        this.indexDirectory = indexDirectory;
    }

    /**
     * Index all the genes of the named organism into the transcript index.
     *
     * @param conn an open database connection; autocommit should be disabled
     *             so that the streaming query can use a cursor
     * @param commonName the common name of the organism to index
     */
    public void addOrganism(Connection conn, String commonName)
        throws CorruptIndexException, LockObtainFailedException, IOException, SQLException, ProcessingException
    {
        this.conn = conn;
        this.featureIndexWriter = getIndexWriter(TRANSCRIPT_INDEX_NAME);
        try {
            addGenes(commonName);
        } finally {
            this.featureIndexWriter.close();
            this.featureIndexWriter = null;
        }
    }

    /**
     * Find the cvterm IDs of all sequence-ontology terms that are
     * (transitive) is_a subtypes, according to cvtermpath, of any of the
     * terms identified by the given accession numbers.
     *
     * @param accessions SO accession numbers, e.g. <code>"0000704"</code>
     * @return the set of matching cvterm IDs; empty if no accessions were given
     */
    private Set<Integer> typeIds(String... accessions) throws SQLException {
        Set<Integer> typeIds = new HashSet<Integer>();
        if (accessions.length == 0) {
            // Guard added: an empty "in ()" clause would be a SQL syntax error.
            return typeIds;
        }

        StringBuilder questionMarks = new StringBuilder();
        for (int i=0; i < accessions.length; i++) {
            if (questionMarks.length() > 0) {
                questionMarks.append(", ");
            }
            questionMarks.append("?");
        }

        // Make sure the cvtermpath table is populated before we rely on it.
        CvTermUtils.checkCvTermPath(conn);

        String sql =
            "select cvtermpath.subject_id"+
            " from cvtermpath" +
            " join cvterm type on cvtermpath.type_id = type.cvterm_id"+
            " join cvterm object on cvtermpath.object_id = object.cvterm_id"+
            " join dbxref object_dbxref on object.dbxref_id = object_dbxref.dbxref_id"+
            " join cv object_cv on object.cv_id = object_cv.cv_id"+
            " where lower(type.name) = 'is_a'"+
            " and object_cv.name = 'sequence'"+
            " and object_dbxref.accession in (" + questionMarks + ")";
        logger.debug("SQL: " + sql);

        PreparedStatement st = conn.prepareStatement(sql);
        try {
            for (int i=0; i < accessions.length; i++) {
                st.setString(1 + i, accessions[i]);
            }
            ResultSet rs = st.executeQuery();
            while (rs.next()) {
                typeIds.add(rs.getInt("subject_id"));
            }
        } finally {
            try {
                st.close();
            } catch (SQLException e) {
                logger.error("Error during close()", e);
            }
        }
        return typeIds;
    }

    /**
     * Callback interface used by {@link #processFeatures}.
     */
    private interface ResultSetProcessor {
        /** Called once for each distinct feature in the result set. */
        public void process(GeneInfo geneInfo) throws Exception;
        /** Called if the query returned no rows at all. */
        public void noResults() throws Exception;
    }

    /**
     * Wraps any exception thrown while processing a single feature, adding
     * the identity of the offending feature to the message. Declared static
     * because it does not use the enclosing instance.
     */
    @SuppressWarnings("serial")
    private static class ProcessingException extends Exception {
        public ProcessingException(String message, Throwable cause) {
            super(message, cause);
        }
    }

    /**
     * Plain data holder for one gene row (plus accumulated synonyms) of the
     * query issued by {@link #processFeatures}. Declared static because it
     * does not use the enclosing instance.
     */
    private static class GeneInfo {
        int featureId;
        String uniqueName;
        String name; // may be null in the database
        boolean isAnalysis;
        int fmin;
        int fmax;
        int strand;
        int srcFeatureId;
        String srcFeatureUniqueName;
        int srcFeatureSeqLen;
        List<String> synonyms = new ArrayList<String>();
        int organismId;
        String organismCommonName;
        String organismAbbreviation;
        String organismGenus;
        String organismSpecies;
        int cvTermId;
        String cvTermName;

        /**
         * Populate the fields from the current row of the result set, which
         * must use the column aliases produced by the query in
         * {@link LuceneIndexBuilder#processFeatures}.
         */
        public GeneInfo(ResultSet rs) throws SQLException {
            featureId = rs.getInt("feature_id");
            uniqueName = rs.getString("uniquename");
            name = rs.getString("name");
            isAnalysis = rs.getBoolean("is_analysis");
            fmin = rs.getInt("fmin");
            fmax = rs.getInt("fmax");
            strand = rs.getShort("strand");
            srcFeatureId = rs.getInt("srcfeature_id");
            srcFeatureUniqueName = rs.getString("srcfeature_uniquename");
            srcFeatureSeqLen = rs.getInt("srcfeature_seqlen");
            String synonym = rs.getString("synonym");
            if (synonym != null) { // the left join can produce a NULL synonym
                synonyms.add(synonym);
            }
            organismId = rs.getInt("organism_id");
            organismCommonName = rs.getString("organism_common_name");
            organismAbbreviation = rs.getString("organism_abbreviation");
            organismGenus = rs.getString("organism_genus");
            organismSpecies = rs.getString("organism_species");
            cvTermId = rs.getInt("type_cvterm_id");
            cvTermName = rs.getString("type_name");
        }
    }

    // Serial number of the feature currently being processed; used only for trace logging.
    int serialNumber;

    /**
     * Run the feature query for the given organism and feed each distinct
     * feature to the supplied processor. The query returns one row per
     * (feature, synonym) pair; consecutive rows for the same feature are
     * merged into a single {@link GeneInfo}.
     *
     * @param organismCommonName common name of the organism whose features to fetch
     * @param typeIds cvterm IDs of the feature types to select; must not be empty
     * @param processor receives each merged feature, or is told if there were none
     * @throws IllegalArgumentException if <code>typeIds</code> is empty
     */
    private void processFeatures(String organismCommonName,
            Collection<Integer> typeIds, ResultSetProcessor processor)
        throws SQLException, ProcessingException
    {
        if (typeIds.isEmpty()) {
            throw new IllegalArgumentException("typeIds is empty");
        }
        StringBuilder typeIdsCommaSeparated = new StringBuilder();
        for(int typeId: typeIds) {
            if (typeIdsCommaSeparated.length() > 0) {
                typeIdsCommaSeparated.append(", ");
            }
            // typeIds are ints, so interpolating them directly is injection-safe.
            typeIdsCommaSeparated.append(typeId);
        }

        String sql =
            "select feature.feature_id"+
            " , feature.uniquename"+
            " , feature.name"+
            " , feature.is_analysis"+
            " , featureloc.fmin"+
            " , featureloc.fmax"+
            " , featureloc.strand"+
            " , srcfeature.feature_id as srcfeature_id"+
            " , srcfeature.uniquename as srcfeature_uniquename"+
            " , srcfeature.seqlen as srcfeature_seqlen"+
            " , synonym_sub.name as synonym"+
            " , organism.organism_id"+
            " , organism.genus as organism_genus"+
            " , organism.species as organism_species"+
            " , organism.common_name as organism_common_name"+
            " , organism.abbreviation as organism_abbreviation"+
            " , type.cvterm_id as type_cvterm_id"+
            " , type.name as type_name"+
            " from feature"+
            " join featureloc on feature.feature_id = featureloc.feature_id"+
            " join feature srcfeature on featureloc.srcfeature_id = srcfeature.feature_id"+
            " left join (" +
            "    select feature_synonym.feature_id, synonym.name" +
            "    from feature_synonym join synonym on feature_synonym.synonym_id = synonym.synonym_id" +
            " ) synonym_sub on feature.feature_id = synonym_sub.feature_id"+
            " join organism on feature.organism_id = organism.organism_id"+
            " join cvterm type on feature.type_id = type.cvterm_id"+
            " where feature.type_id in (" + typeIdsCommaSeparated + ")"+
            " and featureloc.locgroup = 0 and featureloc.rank = 0"+
            " and not feature.is_obsolete"+
            " and feature.organism_id = ("+
            "    select organism_id from organism where common_name = ?" +
            " )"+
            // Bug fix: the merging logic below assumes all rows for a feature
            // arrive consecutively, which is only guaranteed with an explicit
            // ordering.
            " order by feature.feature_id";
        logger.debug("SQL: " + sql);

        PreparedStatement st = conn.prepareStatement(sql);
        st.setFetchSize(fetchSize); // stream rows via a cursor rather than loading them all
        serialNumber = 1;
        try {
            st.setString(1, organismCommonName);
            ResultSet rs = st.executeQuery();

            int previousFeatureId = -1;
            GeneInfo geneInfo = null;
            while (rs.next()) {
                int thisFeatureId = rs.getInt("feature_id");
                if (thisFeatureId == previousFeatureId) {
                    // Another row for the same feature: it can differ only
                    // in the synonym column.
                    String synonym = rs.getString("synonym");
                    if (synonym != null) { // Bug fix: the left join can yield NULL synonyms
                        geneInfo.synonyms.add(synonym);
                    }
                } else {
                    if (previousFeatureId > 0) {
                        processFeature(processor, geneInfo);
                    }
                    geneInfo = new GeneInfo(rs);
                    previousFeatureId = thisFeatureId;
                }
            }
            if (previousFeatureId > 0) {
                processFeature(processor, geneInfo);
            } else {
                // Bug fix: noResults() was declared on ResultSetProcessor but
                // was never invoked; call it when the query returns no rows.
                try {
                    processor.noResults();
                } catch (Exception e) {
                    throw new ProcessingException("Error while handling empty result set", e);
                }
            }
        } finally {
            try {
                st.close();
            } catch (SQLException e) {
                logger.error("Error during close()", e);
            }
        }
    }

    /**
     * Process a single merged feature, translating any exception thrown by the
     * processor into a {@link ProcessingException} whose message identifies
     * the offending feature.
     *
     * @param processor receives the feature
     * @param geneInfo the merged feature data
     * @throws ProcessingException if the processor throws any exception
     */
    private void processFeature(ResultSetProcessor processor, GeneInfo geneInfo)
        throws ProcessingException, SQLException {
        try {
            if (logger.isTraceEnabled()) {
                logger.trace(String.format("[%d] Processing gene '%s'",
                    serialNumber++, geneInfo.uniqueName));
            }
            processor.process(geneInfo);
        } catch (Exception e) {
            throw new ProcessingException(
                String.format("Error processing feature '%s' (ID=%d)",
                    geneInfo.uniqueName, geneInfo.featureId), e);
        }
    }

    // Guards against indexing the same gene twice, which can happen if the
    // result set contains non-consecutive rows for one feature.
    Set<Integer> processedGeneIds;

    /**
     * Index every gene-type feature of the given organism into the
     * transcript index, one Lucene document per gene.
     */
    private void addGenes(final String commonName)
        throws CorruptIndexException, IOException, SQLException, ProcessingException
    {
        // SO accessions: presumably 0000704 = gene and 0000336 = pseudogene — TODO confirm
        Set<Integer> geneTypeIds = typeIds("0000704", "0000336");
        processedGeneIds = new HashSet<Integer>();
        processFeatures(commonName, geneTypeIds, new ResultSetProcessor() {
            @Override
            public void process(GeneInfo geneInfo)
                throws SQLException, CorruptIndexException, IOException {
                if (processedGeneIds.contains(geneInfo.featureId)) {
                    // Bug fix: the format arguments were missing here, so this
                    // statement used to throw MissingFormatArgumentException.
                    logger.error(String.format("Gene '%s' (ID=%d) already processed",
                        geneInfo.uniqueName, geneInfo.featureId));
                    return;
                }
                processedGeneIds.add(geneInfo.featureId);

                Document doc = documentForGene(geneInfo);
                featureIndexWriter.addDocument(doc);
            }

            @Override
            public void noResults() {
                logger.error(String.format("No genes found for '%s' - check spelling", commonName));
            }
        });
        // Bug fix: the writer used to be closed here as well, but it is the
        // responsibility of addOrganism() to close it — closing twice removed.
    }

    // Shared analyzer used to tokenize the combined names field.
    private static Analyzer allNamesAnalyzer = new AllNamesAnalyzer();

    /** Convert null to the empty string: Lucene fields reject null values. */
    private static String nullToEmpty(String s) {
        return s == null ? "" : s;
    }

    /**
     * Build the Lucene document representing a single gene. All values are
     * stored as strings; the combined "allNames" field gathers name,
     * uniqueName and synonyms so they can be searched together.
     */
    private Document documentForGene(GeneInfo geneInfo) {
        Document doc = new Document();

        // Synonyms are stored as a single tab-separated string.
        StringBuilder synonymsAsTabSeparatedString = new StringBuilder();
        for (String synonym: geneInfo.synonyms) {
            if (synonymsAsTabSeparatedString.length() > 0) {
                synonymsAsTabSeparatedString.append('\t');
            }
            synonymsAsTabSeparatedString.append(synonym);
        }

        StringBuilder allNames = new StringBuilder();
        if (geneInfo.name != null) {
            allNames.append(geneInfo.name).append(' ');
        }
        allNames.append(geneInfo.uniqueName).append(' ');
        allNames.append(synonymsAsTabSeparatedString);
        TokenStream allNamesTokenized = allNamesAnalyzer.tokenStream(
            "allNames", new StringReader(allNames.toString()));

        doc.add(new Field("featureId", Integer.toString(geneInfo.featureId),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("uniqueName", geneInfo.uniqueName,
            Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("name", nullToEmpty(geneInfo.name),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("analysis", Boolean.toString(geneInfo.isAnalysis),
            Field.Store.NO, Field.Index.NOT_ANALYZED));
        doc.add(new Field("synonym", synonymsAsTabSeparatedString.toString(),
            Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("allNames", allNamesTokenized));
        // Coordinates are zero-padded, presumably so that lexicographic order
        // matches numeric order for range queries.
        doc.add(new Field("start", String.format("%09d", geneInfo.fmin),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("stop", String.format("%09d", geneInfo.fmax),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("strand", Integer.toString(geneInfo.strand),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("chr", geneInfo.srcFeatureUniqueName,
            Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("chrId", Integer.toString(geneInfo.srcFeatureId),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("chrlen", Integer.toString(geneInfo.srcFeatureSeqLen),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("organism.organismId", Integer.toString(geneInfo.organismId),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
        // Organism columns may be NULL in the database; guard them so the
        // Field constructor does not throw.
        doc.add(new Field("organism.commonName", nullToEmpty(geneInfo.organismCommonName),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("organism.abbreviation", nullToEmpty(geneInfo.organismAbbreviation),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("organism.genus", nullToEmpty(geneInfo.organismGenus),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("organism.species", nullToEmpty(geneInfo.organismSpecies),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("type.cvTermId", Integer.toString(geneInfo.cvTermId),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("type.name", geneInfo.cvTermName,
            Field.Store.YES, Field.Index.NOT_ANALYZED));

        // TODO: protein-related fields are not indexed yet (the original
        // code had a bare "protein" placeholder comment here).
        return doc;
    }
}