/******************************************************************************* * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ package tml.corpus; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.TreeMap; import org.apache.log4j.Logger; import org.apache.lucene.analysis.KeywordAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.TermFreqVector; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.TopFieldDocs; import org.apache.lucene.util.Version; import tml.annotators.Annotator; import tml.storage.Repository; import tml.utils.Stats; import tml.vectorspace.NoDocumentsInCorpusException; import tml.vectorspace.NotEnoughTermsInCorpusException; import tml.vectorspace.SemanticSpace; import tml.vectorspace.TermWeighting; import tml.vectorspace.TermWeightingException; import Jama.Matrix; /** * <p>A {@link Corpus} is a set of {@link TextPassage}s * that are processed to build a {@link SemanticSpace}.</p> * <p>Steps of this process are:</p> * <ul> * <li>Tokenizing the document, i.e. recognizing terms, URLs, etc.</li> * <li>Removing stopwords, like prepositions</li> * <li>Stemming</li> * <li>Term selection</li> * </ul> * <p>Once the {@link Corpus} is loaded, it can create a {@link SemanticSpace} * using a particular dimensionality reduction technique. For the moment only * SVD is implemented, but we expect to implement some others.</p> * <p>The following code show how to load a {@link Corpus} and create a * {@link SemanticSpace}:</p> * <pre> * ... * corpus.setName("Structure of English"); // A human readable name for the corpus * corpus.setTermSelectionCriteria(TermSelection.MIN_DF); // Every term must have a minimum document frequency * corpus.setTermSelectionThreshold(1); // Terms must appear in at least 2 documents * corpus.load(storage); // Load the corpus from the storage * corpus.createSemanticSpace(); // Create an empty semanticSpace * * SemanticSpace space = corpus.getSemanticSpace(); * space.setTermWeightScheme(TermWeight.TF); // The term weight scheme will be the raw term frequency * space.setNormalized(true); // The final vectors will be normalized * space.setDimensionalityReduction(DimensionalityReduction.DIMENSIONS_MAX_NUMBER); * space.setDimensionalityReductionThreshold(2); // Number of dimensions to keep on the dimensionality reduction * space.setDimensionsReduced(true); // The dimensions will be reduced * space.calculate(); // Calculate the semantic space * ... * </pre> * * @author Jorge Villalon * */ public abstract class Corpus implements Cloneable { private static final int MAX_DIMENSIONS = 300; public class PassageFreqs implements Cloneable { private int[] termsIndices; private double[] termsFrequencies; /** * @param termsIndices * @param termsFrequencies */ public PassageFreqs(int[] termsIndices, double[] termsFrequencies) { super(); this.termsIndices = termsIndices; this.termsFrequencies = termsFrequencies; } @Override protected Object clone() throws CloneNotSupportedException { PassageFreqs clone = (PassageFreqs) super.clone(); clone.termsFrequencies = this.termsFrequencies.clone(); clone.termsIndices = this.termsIndices.clone(); return clone; } /** * @return the termsFrequencies */ public double[] getTermsFrequencies() { return termsFrequencies; } /** * @return the termsIndices */ public int[] getTermsIndices() { return termsIndices; } } private static Logger logger = Logger.getLogger(Corpus.class); /** Every corpus should have a human readable name */ private String name; /** SemanticSpace created from the corpus */ protected SemanticSpace space = null; /** The time it took the corpus to load */ protected long processingTime; /** The query to search */ protected String luceneQuery; /** The list of terms in the Corpus*/ protected String[] terms = null; /** The Lucene repository where the corpus original documents are stored */ protected Repository repository; /** A class containing all parameters required to create a Corpus and its SemanticSpace */ protected CorpusParameters parameters = null; /** External ids of all the passages (documents, paragraphs or sentences) */ protected String[] passages = null; /** The id of each passage in the Lucene index */ private int[] passagesLuceneIds = null; private boolean dbAnnotations = false; public boolean isDbAnnotations() { return dbAnnotations; } public void setDbAnnotations(boolean dbAnnotations) { this.dbAnnotations = dbAnnotations; } /** * @return the passagesLuceneIds */ public int[] getPassagesLuceneIds() { return passagesLuceneIds; } /** Number of non zero values in the term doc matrix */ protected int nonzeros = 0; private boolean projection = false; private double[] termEntropies = null; private Stats[] termStats = null; private Stats[] docStats = null; private Matrix termDocs = null; private int dimensions = -1; /** * @return the projection */ public boolean isProjection() { return projection; } /** * Retrieves the index of the term in the corpus * * @param term * @return the term index or -1 if not found */ public int getIndexOfTerm(String term) { int i = 0; for(String t : this.terms) { if(term.equals(t)) return i; i++; } return -1; } public String getFilename() { return //this.getRepository().getIndexPath().replaceAll("[:/\\\\]", "_") + "_" + this.getLuceneQuery().replaceAll("\\W", ""); } /** * @return the termEntropies */ public double[] getTermEntropies() { return termEntropies; } /** * @param termEntropies the termEntropies to set */ public void setTermEntropies(double[] termEntropies) { this.termEntropies = termEntropies; } /** * @return the termStats */ public Stats[] getTermStats() { return termStats; } /** * @param termStats the termStats to set */ public void setTermStats(Stats[] termStats) { this.termStats = termStats; } /** * @return the docStats */ public Stats[] getDocStats() { return docStats; } /** * @param docStats the docStats to set */ public void setDocStats(Stats[] docStats) { this.docStats = docStats; } /** * @param projection the projection to set */ public void setProjection(boolean projection) { this.projection = projection; } /** * @return the nonzeros */ public int getNonzeros() { return nonzeros; } protected PassageFreqs[] passageFrequencies = null; /** * Constructor for every {@link Corpus}. * @param document the {@link TextDocument} to which the {@link Corpus belongs} */ public Corpus() { this.parameters = new CorpusParameters(); this.space = new SemanticSpace(this); } @Override protected Object clone() throws CloneNotSupportedException { Corpus clone = (Corpus) super.clone(); clone.space = (SemanticSpace) this.space.clone(); clone.space.setCorpus(clone); clone.passages = this.passages.clone(); clone.terms = this.terms.clone(); clone.passageFrequencies = new PassageFreqs[this.passageFrequencies.length]; for(int i=0;i<clone.passageFrequencies.length;i++) { clone.passageFrequencies[i] = (PassageFreqs) this.passageFrequencies[i].clone(); } clone.parameters = (CorpusParameters) this.parameters.clone(); return clone; } /** * Returns the string representing the Lucene query used to create the * {@link Corpus} * * @return the query used to create the {@link Corpus} */ public String getLuceneQuery() { return luceneQuery; } /** * @return the name of the {@link Corpus} */ public String getName() { if(this.name == null) return this.getLuceneQuery(); return this.name; } /** * @return the parameters */ public CorpusParameters getParameters() { return parameters; } /** * @return the passageFrequencies */ public PassageFreqs[] getPassageFrequencies() { return passageFrequencies; } /** * @return the passages */ public String[] getPassages() { return passages; } /** * @return the time it took to load the {@link Corpus} */ public long getProcessingTime() { return processingTime; } /** * @return the repository */ public Repository getRepository() { return repository; } /** * @return the {@link SemanticSpace} for the {@link Corpus} */ public SemanticSpace getSemanticSpace() { return this.space; } /** * @return the raw matrix with the term frequencies for the {@link Corpus} */ public Matrix getTermDocMatrix() { return this.termDocs; } /** * @return the terms */ public String[] getTerms() { return terms; } /** * Loads the content of the documents in the query and creates the term-doc * matrix * @param storage the repository to search * * @throws IOException * @throws NotEnoughTermsInCorpusException * @throws NoDocumentsInCorpusException * @throws TermWeightingException */ public void load(Repository repository) throws NotEnoughTermsInCorpusException, IOException, NoDocumentsInCorpusException, TermWeightingException { assert (repository != null); // If we have enough documents we start creating a dictionary this.processingTime = System.currentTimeMillis(); this.repository = repository; logger.debug("Corpus being loaded. Query:" + this.luceneQuery); TopFieldDocs hits = searchFullOpenQuery(this.repository, this.luceneQuery); ScoreDoc[] docs = hits.scoreDocs; // We start with an empty set of documents TreeMap<Integer, TextPassage> textPassages = new TreeMap<Integer, TextPassage>(); // Checking if we got at least one document int numDocuments = hits.totalHits; logger.debug(numDocuments + " documents found"); if (numDocuments < 1) { logger.error("No documents found in Corpus"); throw new NoDocumentsInCorpusException(); } Dictionary dictionary = new Dictionary(this); ArrayList<Integer> invalidDocuments = new ArrayList<Integer>(); if (numDocuments > this.parameters.getMaxDocuments()) numDocuments = this.parameters.getMaxDocuments(); // For each document in the results for (int doc = 0; doc < numDocuments; doc++) { int documentId = docs[doc].doc; // We must get the terms and term frequencies for the document int[] frequencies = null; String[] terms = null; boolean documentIsEmpty = false; try { TermFreqVector tfvector = repository.getIndexReader() .getTermFreqVector(documentId, repository.getLuceneContentField()); frequencies = tfvector.getTermFrequencies(); terms = tfvector.getTerms(); } catch (Exception ex) { // If the document has invalid terms or term frequencies we // leave it empty invalidDocuments.add(documentId); frequencies = new int[] { 0 }; terms = new String[] { "" }; documentIsEmpty = true; String title = repository.getIndexReader().document(documentId) .get("title"); logger.debug("Invalid document found:" + documentId + " ignoring :" + title); } TextPassage passage = null; Document luceneDocument = repository.getIndexSearcher().doc(hits.scoreDocs[doc].doc); String content = luceneDocument .get(repository.getLuceneContentField()); String title = luceneDocument.get(repository.getLuceneTitleField()); String url = luceneDocument.get(repository.getLuceneUrlField()); String type = luceneDocument.get(repository.getLuceneTypeField()); String externalId = luceneDocument.get(repository.getLuceneExternalIdField()); passage = new TextPassage( documentId, // The passage's Lucene id this, // A link to the corpus where the passage belongs content, // The content of the passage title, // The title for the passage url, // Url of the text passage (if any) type, // The type of the passage externalId); // The externalId (in Lucene) of the passage // Obtain annotations from the Lucene index and add them to the passage for(Annotator annotator : repository.getAnnotators()) { String annotation = null; annotation = repository.getAnnotations(externalId, annotator.getFieldName()); if(annotation != null) passage.getAnnotations().put(annotator.getFieldName(), annotation); } // If the document is not empty, we add its terms to the dictionary if (!documentIsEmpty) dictionary.addTerms(terms, frequencies, passage); // We finally add the document to the corpus textPassages.put(documentId, passage); } // Once all the documents were insterted, we remove the terms that don't // meet the selection criteria from the dictionary and documents dictionary.removeTerms(); logger.debug(textPassages.size() + " documents processed, " + dictionary.getTerms().size() + " terms kept"); // We validate that the corpus can be calculated as a SemanticSpace if (dictionary.getTerms().size() < textPassages.size() - 1 || dictionary.getTerms().size() <= 0) { logger.error("Corpus size is invalid!"); throw new NotEnoughTermsInCorpusException(); } this.terms = new String[dictionary.getTerms().size()]; this.passages = new String[textPassages.size()]; this.passagesLuceneIds = new int[textPassages.size()]; this.passageFrequencies = new PassageFreqs[textPassages.size()]; List<String> oldterms = new ArrayList<String>(); List<String> sortedterms = new ArrayList<String>(); for(Term term : dictionary.getTerms()) { this.terms[term.getIndex()] = term.getTerm(); sortedterms.add(term.getTerm()); } for(int i=0;i<this.terms.length;i++) { oldterms.add(this.terms[i]); } Collections.sort(sortedterms); logger.debug("Terms sorted"); int passageIndex = 0; for(TextPassage passage : textPassages.values()) { this.passages[passageIndex] = passage.getExternalId(); this.passagesLuceneIds[passageIndex] = passage.getId(); PassageFreqs pf = new PassageFreqs( passage.getTermsCorpusIndices(), passage.getTermFreqs()); for(int i=0;i<pf.termsIndices.length;i++) { int oldindex = pf.termsIndices[i]; String oldterm = oldterms.get(oldindex); int newindex = sortedterms.indexOf(oldterm); pf.termsIndices[i] = newindex; } this.passageFrequencies[passageIndex] = pf; passageIndex++; nonzeros += pf.termsIndices.length; } logger.debug("Frequencies calculated"); for(int i=0;i<sortedterms.size();i++) { this.terms[i] = sortedterms.get(i); } this.termDocs = getMatrixFromTermFrequencies(); TermWeighting termWeighting = new TermWeighting(this); termWeighting.process(this.termDocs); logger.debug("Term weighting applied"); this.calculateDimensionsToKeep(); this.space.calculate(); this.processingTime = System.currentTimeMillis() - this.processingTime; logger.info("Corpus " + this.luceneQuery + " loaded in " + this.processingTime + " ms. Parameters:" + this.getParameters()); } private void calculateDimensionsToKeep() { int rankS = Math.min( this.getPassages().length, this.getTerms().length); dimensions = 0; switch (this.getParameters().getDimensionalityReduction()) { case NUM: if (this.getParameters().getDimensionalityReductionThreshold() > 0) { dimensions = (int) this.getParameters().getDimensionalityReductionThreshold(); } break; case VARPCT: case PCT: int maxDimensions = rankS; int numDimensions = (int) Math.round(maxDimensions * (this.getParameters().getDimensionalityReductionThreshold() / 100)); dimensions = numDimensions; break; case NO: dimensions = rankS; break; default: logger.error("Invalid dimensionality reduction criterion"); } dimensions = Math.max(1, dimensions); dimensions = Math.min(rankS, dimensions); dimensions = Math.min(MAX_DIMENSIONS, dimensions); } private Matrix getMatrixFromTermFrequencies() { double[][] mdata = new double[this.getTerms().length][this.getPassages().length]; for(int doc=0;doc<this.getPassages().length;doc++) for(int term=0;term<this.getTerms().length;term++) mdata[term][doc] = 0; int doc=0; for(PassageFreqs freqs : this.passageFrequencies) { for(int idx=0;idx<freqs.termsIndices.length;idx++) { int term = freqs.termsIndices[idx]; mdata[term][doc] = freqs.termsFrequencies[idx]; } doc++; } return new Matrix(mdata); } /** * @return the dimensions */ public int getDimensions() { return dimensions; } /** * Prints in the console the parameters used in this corpus */ public String parametersSummary() { StringBuffer buff = new StringBuffer(); buff.append("Name:"); buff.append(this); buff.append("\n"); buff.append("Query:"); buff.append(this.getLuceneQuery()); buff.append("\n"); buff.append("Processing time:"); buff.append(this.getProcessingTime()); buff.append("\n"); buff.append("Semantic Space:"); buff.append(this.getSemanticSpace()); buff.append("\n"); buff.append("Terms:"); buff.append(this.getTerms().length); buff.append("\n"); buff.append("Passages:"); buff.append(this.getPassages().length); buff.append("\n"); return buff.toString(); } public String printFrequencies() { StringBuffer buff = new StringBuffer(); buff.append(this.toString()); buff.append("\n"); for(int j=0; j<this.getTerms().length; j++) { buff.append(this.getTerms()[j]); buff.append("\t"); } buff.append("\n"); for(int i=0; i<this.getPassages().length; i++) { PassageFreqs freqs = this.getPassageFrequencies()[i]; buff.append(this.getPassages()[i]); buff.append("\t"); for(int j=0; j<freqs.getTermsIndices().length; j++) { buff.append(this.getTerms()[freqs.getTermsIndices()[j]]); buff.append("["); buff.append(freqs.getTermsIndices()[j]); buff.append("]-("); buff.append(freqs.getTermsFrequencies()[j]); buff.append(")\t"); } buff.append("\n"); } return buff.toString(); } /** * This method projects a {@link Corpus} into another one. The {@link Corpus} * to project is the parameter, and the projected {@link Corpus} is what the * method returns. * The returned {@link Corpus} will have the same {@link Dictionary} than * this {@link Corpus}, and will use the same parameters to calculate its * {@link SemanticSpace}. * * @param corpusToProject the {@link Corpus} to project * @return the projected {@link Corpus} */ public Corpus projectCorpus(Corpus corpusToProject) throws Exception { Corpus projectedCorpus = null; if(this.space.getSk() == null || this.space.getUk() == null || this.space.getVk() == null) { logger.debug("Corpus " + this.luceneQuery + " will be used to project, but hasn't been calculated, calculating..."); this.space.calculate(); } try { logger.debug("Projecting corpus:" + corpusToProject.getName() + " on " + this.getName()); projectedCorpus = (Corpus) corpusToProject.clone(); projectedCorpus.terms = this.terms.clone(); projectedCorpus.setName(corpusToProject.getName() + " projected on " + this.getName()); List<String> termsList = new ArrayList<String>(); for(int i=0; i<projectedCorpus.getTerms().length; i++) termsList.add(projectedCorpus.getTerms()[i]); logger.debug("Original corpus had " + corpusToProject.getTerms().length + " terms and " + corpusToProject.getPassages().length + " passages"); for(int j=0; j<projectedCorpus.passageFrequencies.length; j++) { PassageFreqs freqs = projectedCorpus.passageFrequencies[j]; List<Double> newFreqs = new ArrayList<Double>(); List<Integer> newIndices = new ArrayList<Integer>(); for(int i=0; i<freqs.termsIndices.length; i++) { String term = corpusToProject.getTerms()[freqs.termsIndices[i]]; double freq = freqs.termsFrequencies[i]; int newIndex = termsList.indexOf(term); freqs.termsIndices[i] = newIndex; if(newIndex >= 0) { newFreqs.add(freq); newIndices.add(newIndex); if(newIndex >= projectedCorpus.getTerms().length) { throw new Exception("ARGH"); } } } freqs.termsIndices = new int[newIndices.size()]; freqs.termsFrequencies = new double[newFreqs.size()]; for(int i=0; i<newIndices.size(); i++) { freqs.termsIndices[i] = newIndices.get(i); freqs.termsFrequencies[i] = newFreqs.get(i); } projectedCorpus.passageFrequencies[j] = freqs; } logger.debug("Final corpus has " + projectedCorpus.getTerms().length + " terms and " + projectedCorpus.getPassages().length + " passages"); } catch (CloneNotSupportedException e) { logger.error(e); return null; } Matrix m = projectedCorpus.getMatrixFromTermFrequencies(); projectedCorpus.termDocs = m; projectedCorpus.space = (SemanticSpace) this.space.clone(); projectedCorpus.getSemanticSpace().setCorpus(projectedCorpus); Matrix s = projectedCorpus.getSemanticSpace().getSk(); Matrix u = projectedCorpus.getSemanticSpace().getUk(); Matrix ss = new Matrix(s.getRowDimension(), s.getRowDimension()); for(int i=0;i<s.getRowDimension();i++) { if(s.get(i, i) != 0) ss.set(i, i, 1/s.get(i, i)); } // Theoretically this produces V Matrix v = m.transpose().times(u).times(ss); projectedCorpus.space.setVk(v); return projectedCorpus; } /** * <p> * This method searches for whatever you want, full documents, sentences or * paragraphs. All mixed up, so this should only be used by experts that * know how tml uses the Lucene index to store its data. * </p> * <p> * For example, to find all the sentences from a document with external id * "foo" * </p> * * <pre> * String query = "type:sentence AND reference:foo"; * searchFullOpenQuery(query); * </pre> * <p> * It returns a Lucene Hits results because the documents inside can't be * used directly to create a Corpus * </p> * * @param query * the Lucene query * @return the search results */ private TopFieldDocs searchFullOpenQuery(Repository storage, String query) { assert (query != null); // The query is parsed QueryParser parser = new QueryParser(Version.LUCENE_29, storage.getLuceneContentField(), new KeywordAnalyzer()); parser.setLowercaseExpandedTerms(false); Query documentsQuery = null; try { documentsQuery = parser.parse(query); } catch (ParseException e) { e.printStackTrace(); logger.error(e.toString()); return null; } // The index is searched using the query TopFieldDocs docs = null; try { docs = new IndexSearcher(storage.getIndexReader()).search(documentsQuery, null, 9999, Sort.INDEXORDER); } catch (Exception e) { logger.error(e.toString()); return null; } return docs; } /** * @param name the name for the {@link Corpus} */ public void setName(String name) { this.name = name; } /** * @param parameters the parameters to set */ public void setParameters(CorpusParameters parameters) { this.parameters = parameters; this.space = new SemanticSpace(this); } /** * Returns the name of the {@link Corpus}. */ @Override public String toString() { return this.getName(); } }