Corpus.java example

Explorer

tml-master
- tml
  - src
    - main
      - java
        SemanticSpace.java
        TmlCommandLine.java
        tml
        Configuration.java
        annotators
        AbstractAnnotator.java
        Annotator.java
        AnnotatorManager.java
        PennTreeAnnotator.java
        corpus
        Corpus.java
        CorpusParameters.java
        Dictionary.java
        ParagraphCorpus.java
        RepositoryCorpus.java
        SearchResultsCorpus.java
        SentenceCorpus.java
        SimpleCorpus.java
        Term.java
        TextDocument.java
        TextPassage.java
        sql
        DbConnection.java
        storage
        DocumentAnnotator.java
        DocumentCleanup.java
        Repository.java
        RepositoryEvent.java
        RepositoryListener.java
        TmlAnnotatorTask.java
        TmlCleanupTask.java
        TmlIndexerTask.java
        importers
        AbstractImporter.java
        HtmlImporter.java
        Importer.java
        PdfImporter.java
        TextImporter.java
        test
        AbstractTmlIndexingTest.java
        utils
        DBUtils.java
        DistanceLib.java
        Highlighting.java
        JDBCUtils.java
        LanczosSVDLIBCUtils.java
        LanczosSVDPACKCUtils.java
        LuceneUtils.java
        MatrixUtils.java
        RegexUtils.java
        StanfordUtils.java
        Stats.java
        WordNetUtils.java
        vectorspace
        EmptyTextPassageException.java
        NoDocumentsInCorpusException.java
        NotEnoughTermsInCorpusException.java
        SVD.java
        SemanticSpace.java
        TermWeighting.java
        TermWeightingException.java
        factorisation
        MatrixFactorisation.java
        MultiDimensionalScalingNR.java
        NonnegativeMatrixFactorisationED.java
        NonnegativeMatrixFactorisationKL.java
        PrincipalCoordinateAnalysis.java
        ProbabilisticLatentSemanticAnalysis.java
        SingularValueDecomposition.java
        SpaceDecomposition.java
        operations
        AbstractOperation.java
        ClassDiscovery.java
        CompoundNounsSummarized.java
        ConceptExtraction.java
        FactorAnalysisPlot.java
        LastPassage.java
        LexiconAnalysis.java
        Operation.java
        OperationEvent.java
        OperationListener.java
        ParagraphCoherenceIndex.java
        PassageDistances.java
        PassageExtractionSummarization.java
        PassagesSimilarity.java
        RapidAutomaticKeywordExtraction.java
        Readability.java
        RelationshipExtraction.java
        Summary.java
        TagClouds.java
        TermExtractionSummarization.java
        results
        AbstractResult.java
        FactorAnalysisPlotResult.java
        LastPassageResult.java
        LexiconAnalysisResult.java
        NullResult.java
        ParagraphCoherenceIndexResult.java
        PassageClusteringLingoResult.java
        PassageDistancesResult.java
        PassageExtractionSummarizationResult.java
        PassageSimilarityResult.java
        RapidAutomaticKeywordExtractionResult.java
        ReadabilityResult.java
        RelationshipExtractionResult.java
        Summary.java
        SummaryResult.java
        TagCloudsResult.java
        TermRankedResult.java
        TermsExtractionSummarizationResult.java
        summarization
        AbstractSummarizationOperation.java
        LatentSemanticAnalysisSummarization.java
        SummarizationOperation.java
        VectorLengthSummarization.java
        visualizations
        AbstractVisualization.java
        TagClouds.java
        Visualization.java
    - test
      - java
        tml
        test
        DbConnectionTest.java
        FactorAnalysisPlotTest.java
        IndexingDocumentsTest.java
        IndexingHtmlTest.java
        IndexingInvalidDocumentsTest.java
        IndexingPlainTextTest.java
        LanczosTest.java
        LuceneSearchTest.java
        NonNegativeMatrixFactorizationTest.java
        RapidAutomaticKeywordExtractionTest.java
        ReadabilityTest.java
        SimpleCorpusTest.java
        StemmingTest.java
        TagCloudsTest.java
        ValidateBerryDumaisTest.java
        ValidateDistancesTest.java
        ValidateHandbookOfLSATest.java
        ValidateIntroToLSATest.java
        ValidateSameDistancesAllDimensions.java

/*******************************************************************************
 *  Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
 *  
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License. 
 *  You may obtain a copy of the License at 
 *  
 *  	http://www.apache.org/licenses/LICENSE-2.0 
 *  	
 *  Unless required by applicable law or agreed to in writing, software 
 *  distributed under the License is distributed on an "AS IS" BASIS, 
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 *  See the License for the specific language governing permissions and 
 *  limitations under the License.
 *******************************************************************************/
package tml.corpus;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.TreeMap;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.util.Version;

import tml.annotators.Annotator;
import tml.storage.Repository;
import tml.utils.Stats;
import tml.vectorspace.NoDocumentsInCorpusException;
import tml.vectorspace.NotEnoughTermsInCorpusException;
import tml.vectorspace.SemanticSpace;
import tml.vectorspace.TermWeighting;
import tml.vectorspace.TermWeightingException;

import Jama.Matrix;


/**
 * <p>A {@link Corpus} is a set of {@link TextPassage}s
 * that are processed to build a {@link SemanticSpace}.</p>
 * <p>Steps of this process are:</p>
 * <ul>
 * <li>Tokenizing the document, i.e. recognizing terms, URLs, etc.</li>
 * <li>Removing stopwords, like prepositions</li>
 * <li>Stemming</li>
 * <li>Term selection</li>
 * </ul>
 * <p>Once the {@link Corpus} is loaded, it can create a {@link SemanticSpace}
 * using a particular dimensionality reduction technique. For the moment only
 * SVD is implemented, but we expect to implement some others.</p>
 * <p>The following code show how to load a {@link Corpus} and create a
 * {@link SemanticSpace}:</p>
 * <pre>
 * 	...
 * 	corpus.setName("Structure of English"); // A human readable name for the corpus
 * 	corpus.setTermSelectionCriteria(TermSelection.MIN_DF); // Every term must have a minimum document frequency
 * 	corpus.setTermSelectionThreshold(1); // Terms must appear in at least 2 documents
 *	corpus.load(storage); // Load the corpus from the storage
 *	corpus.createSemanticSpace(); // Create an empty semanticSpace
 *
 * 	SemanticSpace space = corpus.getSemanticSpace();
 *	space.setTermWeightScheme(TermWeight.TF); // The term weight scheme will be the raw term frequency
 *	space.setNormalized(true); // The final vectors will be normalized
 *	space.setDimensionalityReduction(DimensionalityReduction.DIMENSIONS_MAX_NUMBER);
 *	space.setDimensionalityReductionThreshold(2); // Number of dimensions to keep on the dimensionality reduction
 *	space.setDimensionsReduced(true); // The dimensions will be reduced
 *	space.calculate(); // Calculate the semantic space
 *	...
 * </pre>
 * 
 * @author Jorge Villalon
 *
 */
public abstract class Corpus implements Cloneable {

	private static final int MAX_DIMENSIONS = 300;

	public class PassageFreqs implements Cloneable {
		private int[] termsIndices;
		private double[] termsFrequencies;
		
		/**
		 * @param termsIndices
		 * @param termsFrequencies
		 */
		public PassageFreqs(int[] termsIndices, double[] termsFrequencies) {
			super();
			this.termsIndices = termsIndices;
			this.termsFrequencies = termsFrequencies;
		}

		@Override
		protected Object clone() throws CloneNotSupportedException {
			PassageFreqs clone = (PassageFreqs) super.clone();
			clone.termsFrequencies = this.termsFrequencies.clone();
			clone.termsIndices = this.termsIndices.clone();
			return clone;
		}

		/**
		 * @return the termsFrequencies
		 */
		public double[] getTermsFrequencies() {
			return termsFrequencies;
		}
		
		/**
		 * @return the termsIndices
		 */
		public int[] getTermsIndices() {
			return termsIndices;
		}
	}
	private static Logger logger = Logger.getLogger(Corpus.class);

	/** Every corpus should have a human readable name */
	private String name;
	/** SemanticSpace created from the corpus */
	protected SemanticSpace space = null;
	/** The time it took the corpus to load */
	protected long processingTime;
	/** The query to search */
	protected String luceneQuery;
	/** The list of terms in the Corpus*/
	protected String[] terms = null;
	/** The Lucene repository where the corpus original documents are stored */
	protected Repository repository;
	/** A class containing all parameters required to create a Corpus and its SemanticSpace */
	protected CorpusParameters parameters = null;
	/** External ids of all the passages (documents, paragraphs or sentences) */
	protected String[] passages = null;
	/** The id of each passage in the Lucene index */
	private int[] passagesLuceneIds = null;
	
	private boolean dbAnnotations = false;
	public boolean isDbAnnotations() {
		return dbAnnotations;
	}

	public void setDbAnnotations(boolean dbAnnotations) {
		this.dbAnnotations = dbAnnotations;
	}

	/**
	 * @return the passagesLuceneIds
	 */
	public int[] getPassagesLuceneIds() {
		return passagesLuceneIds;
	}

	/** Number of non zero values in the term doc matrix */
	protected int nonzeros = 0;
	private boolean projection = false;
	private double[] termEntropies = null;
	private Stats[] termStats = null;
	private Stats[] docStats = null;
	private Matrix termDocs = null;
	private int dimensions = -1;
	/**
	 * @return the projection
	 */
	public boolean isProjection() {
		return projection;
	}

        /**
         * Retrieves the index of the term in the corpus
         * 
         * @param term
         * @return the term index or -1 if not found
         */
        public int getIndexOfTerm(String term) {
            int i = 0;
            for(String t : this.terms) {
                if(term.equals(t))
                    return i;
                i++;
            }
            return -1;
        }

	public String getFilename() {
		return
		//this.getRepository().getIndexPath().replaceAll("[:/\\\\]", "_") + "_" +
		this.getLuceneQuery().replaceAll("\\W", "");		
	}
	/**
	 * @return the termEntropies
	 */
	public double[] getTermEntropies() {
		return termEntropies;
	}

	/**
	 * @param termEntropies the termEntropies to set
	 */
	public void setTermEntropies(double[] termEntropies) {
		this.termEntropies = termEntropies;
	}

	/**
	 * @return the termStats
	 */
	public Stats[] getTermStats() {
		return termStats;
	}

	/**
	 * @param termStats the termStats to set
	 */
	public void setTermStats(Stats[] termStats) {
		this.termStats = termStats;
	}

	/**
	 * @return the docStats
	 */
	public Stats[] getDocStats() {
		return docStats;
	}

	/**
	 * @param docStats the docStats to set
	 */
	public void setDocStats(Stats[] docStats) {
		this.docStats = docStats;
	}

	/**
	 * @param projection the projection to set
	 */
	public void setProjection(boolean projection) {
		this.projection = projection;
	}


	/**
	 * @return the nonzeros
	 */
	public int getNonzeros() {
		return nonzeros;
	}

	protected PassageFreqs[] passageFrequencies = null;

	/**
	 * Constructor for every {@link Corpus}.
	 * @param document the {@link TextDocument} to which the {@link Corpus belongs}
	 */
	public Corpus() {
		this.parameters = new CorpusParameters();
		this.space = new SemanticSpace(this);
	}

	@Override
	protected Object clone() throws CloneNotSupportedException {
		Corpus clone = (Corpus) super.clone();
		clone.space = (SemanticSpace) this.space.clone();
		clone.space.setCorpus(clone);
		clone.passages = this.passages.clone();
		clone.terms = this.terms.clone();
		clone.passageFrequencies = new PassageFreqs[this.passageFrequencies.length];
		for(int i=0;i<clone.passageFrequencies.length;i++) {
			clone.passageFrequencies[i] = (PassageFreqs) this.passageFrequencies[i].clone();
		}
		clone.parameters = (CorpusParameters) this.parameters.clone();
		return clone;
	}

	/**
	 * Returns the string representing the Lucene query used to create the
	 * {@link Corpus}
	 * 
	 * @return the query used to create the {@link Corpus}
	 */
	public String getLuceneQuery() {
		return luceneQuery;
	}

	/**
	 * @return the name of the {@link Corpus} 
	 */
	public String getName() {
		if(this.name == null)
			return this.getLuceneQuery();
		return this.name;
	}
	
	/**
	 * @return the parameters
	 */
	public CorpusParameters getParameters() {
		return parameters;
	} 

	/**
	 * @return the passageFrequencies
	 */
	public PassageFreqs[] getPassageFrequencies() {
		return passageFrequencies;
	}

	/**
	 * @return the passages
	 */
	public String[] getPassages() {
		return passages;
	}

	/**
	 * @return the time it took to load the {@link Corpus}
	 */
	public long getProcessingTime() {
		return processingTime;
	}

	/**
	 * @return the repository
	 */
	public Repository getRepository() {
		return repository;
	}

	/**
	 * @return the {@link SemanticSpace} for the {@link Corpus}
	 */
	public SemanticSpace getSemanticSpace() {
		return this.space;
	}

	/**
	 * @return the raw matrix with the term frequencies for the {@link Corpus}
	 */
	public Matrix getTermDocMatrix() {
		return this.termDocs;
	}

	/**
	 * @return the terms
	 */
	public String[] getTerms() {
		return terms;
	}

	/**
	 * Loads the content of the documents in the query and creates the term-doc
	 * matrix
	 * @param storage the repository to search
	 * 
	 * @throws IOException
	 * @throws NotEnoughTermsInCorpusException
	 * @throws NoDocumentsInCorpusException
	 * @throws TermWeightingException 
	 */
	public void load(Repository repository)
	throws NotEnoughTermsInCorpusException, IOException,
	NoDocumentsInCorpusException, TermWeightingException {

		assert (repository != null);
		
		// If we have enough documents we start creating a dictionary
		this.processingTime = System.currentTimeMillis();

		this.repository = repository;

		logger.debug("Corpus being loaded. Query:" + this.luceneQuery);

		TopFieldDocs hits = searchFullOpenQuery(this.repository, this.luceneQuery);
		ScoreDoc[] docs = hits.scoreDocs;

		// We start with an empty set of documents
		TreeMap<Integer, TextPassage> textPassages = new TreeMap<Integer, TextPassage>();

		// Checking if we got at least one document
		int numDocuments = hits.totalHits;
		logger.debug(numDocuments + " documents found");

		if (numDocuments < 1) {
			logger.error("No documents found in Corpus");
			throw new NoDocumentsInCorpusException();
		}

		Dictionary dictionary = new Dictionary(this);

		ArrayList<Integer> invalidDocuments = new ArrayList<Integer>();

		if (numDocuments > this.parameters.getMaxDocuments())
			numDocuments = this.parameters.getMaxDocuments();

		// For each document in the results
		for (int doc = 0; doc < numDocuments; doc++) {

			int documentId = docs[doc].doc;

			// We must get the terms and term frequencies for the document
			int[] frequencies = null;
			String[] terms = null;

			boolean documentIsEmpty = false;

			try {
				TermFreqVector tfvector = repository.getIndexReader()
				.getTermFreqVector(documentId,
						repository.getLuceneContentField());
				frequencies = tfvector.getTermFrequencies();
				terms = tfvector.getTerms();
			} catch (Exception ex) {
				// If the document has invalid terms or term frequencies we
				// leave it empty
				invalidDocuments.add(documentId);
				frequencies = new int[] { 0 };
				terms = new String[] { "" };
				documentIsEmpty = true;
				String title = repository.getIndexReader().document(documentId)
				.get("title");
				logger.debug("Invalid document found:" + documentId
						+ " ignoring :" + title);
			}

			TextPassage passage = null;

			Document luceneDocument = repository.getIndexSearcher().doc(hits.scoreDocs[doc].doc);
			String content = luceneDocument
			.get(repository.getLuceneContentField());
			String title = luceneDocument.get(repository.getLuceneTitleField());
			String url = luceneDocument.get(repository.getLuceneUrlField());
			String type = luceneDocument.get(repository.getLuceneTypeField());
			String externalId = luceneDocument.get(repository.getLuceneExternalIdField());

			passage = new TextPassage(
					documentId, // The passage's Lucene id
					this, // A link to the corpus where the passage belongs
					content, // The content of the passage 
					title, // The title for the passage
					url, // Url of the text passage (if any)
					type, // The type of the passage
					externalId); // The externalId (in Lucene) of the passage

			// Obtain annotations from the Lucene index and add them to the passage
			for(Annotator annotator : repository.getAnnotators()) {
				String annotation = null;
				annotation = repository.getAnnotations(externalId, annotator.getFieldName());
				if(annotation != null)
					passage.getAnnotations().put(annotator.getFieldName(), annotation);
			}
			// If the document is not empty, we add its terms to the dictionary
			if (!documentIsEmpty)
				dictionary.addTerms(terms, frequencies, passage);

			// We finally add the document to the corpus
			textPassages.put(documentId, passage);
		}

		// Once all the documents were insterted, we remove the terms that don't
		// meet the selection criteria from the dictionary and documents
		dictionary.removeTerms();

		logger.debug(textPassages.size() + " documents processed, "
				+ dictionary.getTerms().size() + " terms kept");

		// We validate that the corpus can be calculated as a SemanticSpace
		if (dictionary.getTerms().size() < textPassages.size() - 1
				|| dictionary.getTerms().size() <= 0) {
			logger.error("Corpus size is invalid!");
			throw new NotEnoughTermsInCorpusException();
		}
		
		this.terms = new String[dictionary.getTerms().size()];
		this.passages = new String[textPassages.size()];
		this.passagesLuceneIds = new int[textPassages.size()];
		this.passageFrequencies = new PassageFreqs[textPassages.size()];
		
		List<String> oldterms = new ArrayList<String>();
		List<String> sortedterms = new ArrayList<String>();
		
		for(Term term : dictionary.getTerms()) {
			this.terms[term.getIndex()] = term.getTerm();
			sortedterms.add(term.getTerm());
		}
		
		for(int i=0;i<this.terms.length;i++) {
			oldterms.add(this.terms[i]);
		}
		
		Collections.sort(sortedterms);
		
		logger.debug("Terms sorted");
		
		int passageIndex = 0;
		for(TextPassage passage : textPassages.values()) {
			this.passages[passageIndex] = passage.getExternalId();
			this.passagesLuceneIds[passageIndex] = passage.getId();
			PassageFreqs pf = new PassageFreqs(
					passage.getTermsCorpusIndices(), 
					passage.getTermFreqs());
			for(int i=0;i<pf.termsIndices.length;i++) {
				int oldindex = pf.termsIndices[i];
				String oldterm = oldterms.get(oldindex);
				int newindex = sortedterms.indexOf(oldterm);
				pf.termsIndices[i] = newindex;
			}
			this.passageFrequencies[passageIndex] = pf;
			passageIndex++;
			nonzeros += pf.termsIndices.length;
		}
		
		logger.debug("Frequencies calculated");
		
		for(int i=0;i<sortedterms.size();i++) {
			this.terms[i] = sortedterms.get(i);
		}

		this.termDocs = getMatrixFromTermFrequencies();
		
		TermWeighting termWeighting = new TermWeighting(this);
		termWeighting.process(this.termDocs);
		
		logger.debug("Term weighting applied");
		
		this.calculateDimensionsToKeep();
		
		this.space.calculate();
		
		this.processingTime = System.currentTimeMillis() - this.processingTime;
		
		logger.info("Corpus " + this.luceneQuery + " loaded in " + this.processingTime + " ms. Parameters:" + this.getParameters());
	}

	private void calculateDimensionsToKeep() {
		int rankS = Math.min(
				this.getPassages().length,
				this.getTerms().length);

		dimensions = 0;
		switch (this.getParameters().getDimensionalityReduction()) {
		case NUM:
			if (this.getParameters().getDimensionalityReductionThreshold() > 0) {
				dimensions = (int) this.getParameters().getDimensionalityReductionThreshold();
			}
			break;
		case VARPCT:
		case PCT:
			int maxDimensions = rankS;
			int numDimensions = (int) Math.round(maxDimensions
					* (this.getParameters().getDimensionalityReductionThreshold() 
							/ 100));
			dimensions = numDimensions;
			break;
		case NO:
			dimensions = rankS;
			break;
		default:
			logger.error("Invalid dimensionality reduction criterion");
		}

		dimensions = Math.max(1, dimensions);
		dimensions = Math.min(rankS, dimensions);
		dimensions = Math.min(MAX_DIMENSIONS, dimensions);		
	}
	
	private Matrix getMatrixFromTermFrequencies() {
		double[][] mdata = new double[this.getTerms().length][this.getPassages().length];
		for(int doc=0;doc<this.getPassages().length;doc++)
			for(int term=0;term<this.getTerms().length;term++)
				mdata[term][doc] = 0;
		int doc=0;
		for(PassageFreqs freqs : this.passageFrequencies) {
			for(int idx=0;idx<freqs.termsIndices.length;idx++) {
				int term = freqs.termsIndices[idx];
				mdata[term][doc] = freqs.termsFrequencies[idx];
			}
			doc++;
		}
		
		return new Matrix(mdata);		
	}
	/**
	 * @return the dimensions
	 */
	public int getDimensions() {
		return dimensions;
	}

	/**
	 * Prints in the console the parameters used in this corpus
	 */
	public String parametersSummary() {
		StringBuffer buff = new StringBuffer();
		buff.append("Name:");
		buff.append(this);
		buff.append("\n");
		buff.append("Query:");
		buff.append(this.getLuceneQuery());
		buff.append("\n");
		buff.append("Processing time:");
		buff.append(this.getProcessingTime());
		buff.append("\n");
		buff.append("Semantic Space:");
		buff.append(this.getSemanticSpace());
		buff.append("\n");
		buff.append("Terms:");
		buff.append(this.getTerms().length);
		buff.append("\n");
		buff.append("Passages:");
		buff.append(this.getPassages().length);
		buff.append("\n");
		return buff.toString();
	}
	
	public String printFrequencies() {
		StringBuffer buff = new StringBuffer();
		buff.append(this.toString());
		buff.append("\n");
		for(int j=0; j<this.getTerms().length; j++) {
			buff.append(this.getTerms()[j]);
			buff.append("\t");
		}
		buff.append("\n");
		for(int i=0; i<this.getPassages().length; i++) {
			PassageFreqs freqs = this.getPassageFrequencies()[i];
			buff.append(this.getPassages()[i]);
			buff.append("\t");
			for(int j=0; j<freqs.getTermsIndices().length; j++) {
				buff.append(this.getTerms()[freqs.getTermsIndices()[j]]);
				buff.append("[");
				buff.append(freqs.getTermsIndices()[j]);
				buff.append("]-(");
				buff.append(freqs.getTermsFrequencies()[j]);
				buff.append(")\t");
			}
			buff.append("\n");
		}
		return buff.toString();
	}
	
	/**
	 * This method projects a {@link Corpus} into another one. The {@link Corpus}
	 * to project is the parameter, and the projected {@link Corpus} is what the
	 * method returns.
	 * The returned {@link Corpus} will have the same {@link Dictionary} than
	 * this {@link Corpus}, and will use the same parameters to calculate its
	 * {@link SemanticSpace}.
	 * 
	 * @param corpusToProject the {@link Corpus} to project
	 * @return the projected {@link Corpus}
	 */
	public Corpus projectCorpus(Corpus corpusToProject) throws Exception {
		
		Corpus projectedCorpus = null;
		
		if(this.space.getSk() == null ||
				this.space.getUk() == null ||
				this.space.getVk() == null) {
			logger.debug("Corpus " + this.luceneQuery + " will be used to project, but hasn't been calculated, calculating...");
			this.space.calculate();
		}
		
		try {
			logger.debug("Projecting corpus:" + corpusToProject.getName() + " on " + this.getName());
			projectedCorpus = (Corpus) corpusToProject.clone();
			projectedCorpus.terms = this.terms.clone();
			projectedCorpus.setName(corpusToProject.getName() + " projected on " + this.getName());
			List<String> termsList = new ArrayList<String>();
			for(int i=0; i<projectedCorpus.getTerms().length; i++)
				termsList.add(projectedCorpus.getTerms()[i]);
			logger.debug("Original corpus had " + corpusToProject.getTerms().length + 
					" terms and " + corpusToProject.getPassages().length + " passages");
			for(int j=0; j<projectedCorpus.passageFrequencies.length; j++) {
				PassageFreqs freqs = projectedCorpus.passageFrequencies[j];
				List<Double> newFreqs = new ArrayList<Double>();
				List<Integer> newIndices = new ArrayList<Integer>();
				
				for(int i=0; i<freqs.termsIndices.length; i++) {
					String term = corpusToProject.getTerms()[freqs.termsIndices[i]];
					double freq = freqs.termsFrequencies[i];
					int newIndex = termsList.indexOf(term);
					freqs.termsIndices[i] = newIndex;
					if(newIndex >= 0) {
						newFreqs.add(freq);
						newIndices.add(newIndex);
						if(newIndex >= projectedCorpus.getTerms().length) {
							throw new Exception("ARGH");
						}
					}
				}
				
				freqs.termsIndices = new int[newIndices.size()];
				freqs.termsFrequencies = new double[newFreqs.size()];
				for(int i=0; i<newIndices.size(); i++) {
					freqs.termsIndices[i] = newIndices.get(i);
					freqs.termsFrequencies[i] = newFreqs.get(i);
				}
				projectedCorpus.passageFrequencies[j] = freqs;
			}
			
			logger.debug("Final corpus has " + projectedCorpus.getTerms().length + 
					" terms and " + projectedCorpus.getPassages().length + " passages");
		} catch (CloneNotSupportedException e) {
			logger.error(e);
			return null;
		}

		Matrix m = projectedCorpus.getMatrixFromTermFrequencies();
		projectedCorpus.termDocs = m;
		projectedCorpus.space = (SemanticSpace) this.space.clone();
		projectedCorpus.getSemanticSpace().setCorpus(projectedCorpus);
		Matrix s = projectedCorpus.getSemanticSpace().getSk();
		Matrix u = projectedCorpus.getSemanticSpace().getUk();

		Matrix ss = new Matrix(s.getRowDimension(), s.getRowDimension());
		for(int i=0;i<s.getRowDimension();i++) {
			if(s.get(i, i) != 0)
				ss.set(i, i, 1/s.get(i, i));
		}
		// Theoretically this produces V
		Matrix v = m.transpose().times(u).times(ss);
		
		projectedCorpus.space.setVk(v);
		
		return projectedCorpus;
	}
	
	/**
	 * <p>
	 * This method searches for whatever you want, full documents, sentences or
	 * paragraphs. All mixed up, so this should only be used by experts that
	 * know how tml uses the Lucene index to store its data.
	 * </p>
	 * <p>
	 * For example, to find all the sentences from a document with external id
	 * "foo"
	 * </p>
	 * 
	 * <pre>
	 * String query = "type:sentence AND reference:foo";
	 * searchFullOpenQuery(query);
	 * </pre>
	 * <p>
	 * It returns a Lucene Hits results because the documents inside can't be
	 * used directly to create a Corpus
	 * </p>
	 * 
	 * @param query
	 *            the Lucene query
	 * @return the search results
	 */
	private TopFieldDocs searchFullOpenQuery(Repository storage, String query) {
		assert (query != null);

		// The query is parsed
		QueryParser parser = new QueryParser(Version.LUCENE_29,
				storage.getLuceneContentField(),
				new KeywordAnalyzer());
		parser.setLowercaseExpandedTerms(false);
		Query documentsQuery = null;
		try {
			documentsQuery = parser.parse(query);
		} catch (ParseException e) {
			e.printStackTrace();
			logger.error(e.toString());
			return null;
		}

		// The index is searched using the query
		TopFieldDocs docs = null;
		try {
			docs = new IndexSearcher(storage.getIndexReader()).search(documentsQuery, null, 9999, Sort.INDEXORDER);
		} catch (Exception e) {
			logger.error(e.toString());
			return null;
		}

		return docs;
	}
	
	/**
	 * @param name the name for the {@link Corpus}
	 */
	public void setName(String name) {
		this.name = name;
	}
	
	/**
	 * @param parameters the parameters to set
	 */
	public void setParameters(CorpusParameters parameters) {
		this.parameters = parameters;
		this.space = new SemanticSpace(this);
	}
	
	/**
	 * Returns the name of the {@link Corpus}.
	 */
	@Override
	public String toString() {
		return this.getName();
	}
}