/*******************************************************************************
* Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package tml.corpus;
import java.util.Collection;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.TreeMap;
import org.apache.log4j.Logger;
import tml.utils.Stats;
/**
 * This class represents a text passage that is part of a {@link Corpus}. It
 * can be a sentence, a paragraph, a complete document or any other piece of
 * text of any length.
 * <p>
 * A passage keeps its {@link Term}s sorted by index together with their
 * frequencies. The packed arrays returned by {@link #getTermFreqs()} and
 * {@link #getTermsCorpusIndices()} are computed on first use and cached;
 * every change to the terms discards that cache, so the arrays are rebuilt
 * the next time they are requested.
 *
 * @author Jorge Villalon
 *
 */
public class TextPassage {

    /**
     * This class represents the statistics for a {@link TextPassage}. Basically
     * how many total terms and how many different terms.
     * <p>
     * The statistics only exist after {@link #load(int[])} has been called;
     * before that {@link #getStats()} returns {@code null} and the two counting
     * getters throw a {@link NullPointerException}.
     *
     * @author Jorge Villalon
     *
     */
    public class TextPassageStats {

        /** The loaded statistics, {@code null} until {@link #load(int[])} runs */
        private Stats stats;

        /**
         * How many different terms the {@link TextPassage} contains
         *
         * @return the number of different terms, i.e. how many positive
         *         frequencies were loaded
         */
        public int getDifferentTerms() {
            return (int) this.stats.count;
        }

        /**
         * @return the statistics for the {@link TextPassage}, or {@code null}
         *         if nothing has been loaded yet
         */
        public Stats getStats() {
            return stats;
        }

        /**
         * @return the total number of terms in a {@link TextPassage}, i.e. the
         *         sum of the loaded frequencies
         */
        public int getTotalTerms() {
            return (int) this.stats.sum;
        }

        /**
         * Loads a set of term frequencies into the passage, replacing any
         * statistics loaded before. Only strictly positive frequencies are
         * taken into account.
         *
         * @param termsFrequencies
         *            an array with the term frequencies
         */
        public void load(int[] termsFrequencies) {
            // Accumulate in a local object so a failure cannot leave
            // half-loaded statistics behind
            Stats loaded = new Stats();
            for (int frequency : termsFrequencies) {
                if (frequency > 0) {
                    loaded.add(frequency);
                }
            }
            loaded.calculateDerived();
            this.stats = loaded;
        }
    }

    /** The log4j logger */
    private static final Logger logger = Logger.getLogger(TextPassage.class);

    /** A unique id */
    private final int id;
    /** The external id to identify it in Lucene */
    private final String externalId;
    /** The statistics of the passage */
    private final Stats stats;
    /** The raw term frequencies (lazily calculated cache, null when stale) */
    private double[] termsFreqs = null;
    /** The term indices in the dictionary (lazily calculated cache, null when stale) */
    private int[] termsIndices = null;
    /** A map with the sorted terms */
    private final TreeMap<Integer, Term> terms;
    /** A hash table with the term frequencies */
    private final Hashtable<Integer, Integer> termFrequencies;
    /** The corpus to which the passage belongs */
    private final Corpus corpus;
    /** The content of the TextPassage */
    private final String content;
    /** A human readable title for the passage */
    private final String title;
    /** URL of the passage */
    private final String url;
    /** The type of the passage (doc, parag, senten) */
    private final String type;
    /** Annotations obtained from the Lucene index */
    private final Hashtable<String, String> annotations;

    /**
     * Creates a new instance of a {@link TextPassage} with no terms and no
     * annotations.
     *
     * @param id the id of the passage, expected to be non negative (checked
     *            only when assertions are enabled)
     * @param corpus the {@link Corpus} to which the passage belongs, expected
     *            to be non null (checked only when assertions are enabled)
     * @param content the content of the passage
     * @param title the title for the passage
     * @param url the url for the passage
     * @param type the type of the passage (document, paragraph or sentence)
     * @param externalId Lucene id of the passage
     */
    public TextPassage(int id, Corpus corpus, String content, String title,
            String url, String type, String externalId) {
        assert (corpus != null);
        assert (id >= 0);

        this.id = id;
        this.corpus = corpus;
        this.title = title;
        this.content = content;
        this.url = url;
        this.type = type;
        this.externalId = externalId;

        // Initialising containers
        this.stats = new Stats();
        this.terms = new TreeMap<Integer, Term>();
        this.termFrequencies = new Hashtable<Integer, Integer>();
        this.annotations = new Hashtable<String, String>();
    }

    /**
     * Adds a {@link Term} to the passage, it adds a number to the statistics
     * but it doesn't calculate the final values. A term already stored under
     * the same index is replaced; the statistics still receive the new
     * frequency as an additional value.
     *
     * @param term
     *            the term to add, stored under its current index
     * @param frequency
     *            how many times the term appears in the passage
     */
    public void addTerm(Term term, int frequency) {
        int index = term.getIndex();
        this.terms.put(index, term);
        this.termFrequencies.put(index, frequency);
        this.stats.add(frequency);
        // The packed arrays no longer describe the current terms
        this.invalidatePackedArrays();
    }

    /**
     * Calculates the packed arrays for terms and frequencies, in ascending
     * order of the index under which each term is stored.
     *
     * @throws IllegalStateException
     *             if a term has no frequency stored under its current index
     */
    private void calculate() {
        int size = this.terms.size();
        double[] freqs = new double[size];
        int[] indices = new int[size];

        int position = 0;
        for (Term term : this.terms.values()) {
            int index = term.getIndex();
            Integer frequency = this.termFrequencies.get(index);
            if (frequency == null) {
                throw new IllegalStateException(
                        "No frequency stored for term index " + index
                                + " in " + this);
            }
            freqs[position] = frequency;
            indices[position] = index;
            position++;
        }

        // Publish both arrays together so a failure above never leaves a
        // partially filled array visible through the getters
        this.termsFreqs = freqs;
        this.termsIndices = indices;
    }

    /**
     * Runs {@link #calculate()} and, if it fails, logs the problem and leaves
     * both packed arrays as {@code null}.
     */
    private void calculatePackedArrays() {
        try {
            this.calculate();
        } catch (RuntimeException e) {
            logger.error("Could not calculate the packed term arrays of "
                    + this, e);
            this.invalidatePackedArrays();
        }
    }

    /**
     * Discards the cached packed arrays so they are recalculated on the next
     * request.
     */
    private void invalidatePackedArrays() {
        this.termsFreqs = null;
        this.termsIndices = null;
    }

    /**
     * @return the annotations
     */
    public Hashtable<String, String> getAnnotations() {
        return annotations;
    }

    /**
     * @return the content of the passage
     */
    public String getContent() {
        return content;
    }

    /**
     * @return the {@link Corpus} to which the passage belongs
     */
    public Corpus getCorpus() {
        return corpus;
    }

    /**
     * @return the externalId
     */
    public String getExternalId() {
        return externalId;
    }

    /**
     * @return the unique id of the passage
     */
    public int getId() {
        return id;
    }

    /**
     * @return basic statistics for the passage
     */
    public Stats getStats() {
        return stats;
    }

    /**
     * @return a packed array with the frequencies of the passage terms, or
     *         {@code null} if the array could not be calculated
     */
    public double[] getTermFreqs() {
        if (this.termsFreqs == null) {
            this.calculatePackedArrays();
        }
        return this.termsFreqs;
    }

    /**
     * @return all the {@link Term}s in the passage, as a live view of the
     *         underlying sorted map
     */
    public Collection<Term> getTerms() {
        return this.terms.values();
    }

    /**
     * @return an array of indices of the terms within the passage, or
     *         {@code null} if the array could not be calculated
     */
    public int[] getTermsCorpusIndices() {
        if (this.termsIndices == null) {
            this.calculatePackedArrays();
        }
        return this.termsIndices;
    }

    /**
     * @return the title of the passage
     */
    public String getTitle() {
        return title;
    }

    /**
     * @return the type of the passage (document, paragraph or sentence)
     */
    public String getType() {
        return type;
    }

    /**
     * @return the url of the passage
     */
    public String getUrl() {
        return url;
    }

    /**
     * @return if the {@link TextPassage} contains no {@link Term} at all
     */
    public boolean isEmpty() {
        return this.terms.isEmpty();
    }

    /**
     * Removes a {@link Term} from the passage. The statistics returned by
     * {@link #getStats()} are left untouched.
     *
     * @param term
     *            the term to remove, looked up by its current index
     */
    public void removeTerm(Term term) {
        int index = term.getIndex();
        this.terms.remove(index);
        this.termFrequencies.remove(index);
        // The packed arrays no longer describe the current terms
        this.invalidatePackedArrays();
    }

    /**
     * Basic output of a text passage
     */
    @Override
    public String toString() {
        return "Text passage [" + this.getId() + "]";
    }

    /**
     * Updates the index of a {@link Term} in the passage, moving both the term
     * and its frequency from the old index to the new one.
     *
     * @param term
     *            the {@link Term} which index will be updated
     * @param oldIndex
     *            the old index
     * @param newIndex
     *            the new index
     * @throws IllegalArgumentException
     *             if the passage has no frequency stored under
     *             <code>oldIndex</code>; the passage is not modified in that
     *             case
     */
    public void updateTermIndex(Term term, int oldIndex, int newIndex) {
        Term oldTerm = this.terms.get(oldIndex);
        assert (oldTerm == term);

        // Validate before touching the maps so a bad index cannot leave the
        // term registered without a frequency
        Integer frequency = this.termFrequencies.get(oldIndex);
        if (frequency == null) {
            throw new IllegalArgumentException("Text passage [" + this.id
                    + "] has no term at index " + oldIndex);
        }

        this.terms.remove(oldIndex);
        this.termFrequencies.remove(oldIndex);
        this.terms.put(newIndex, term);
        this.termFrequencies.put(newIndex, frequency);
        // The packed arrays no longer describe the current indices
        this.invalidatePackedArrays();
    }
}