package ecologylab.bigsemantics.model.text;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import ecologylab.appframework.types.prefs.Pref;
import ecologylab.bigsemantics.html.utils.StringBuilderUtils;
import ecologylab.generic.FeatureVector;
import ecologylab.generic.IFeatureVector;
import ecologylab.generic.StringTools;
/**
* TermVector represents a collection of Terms, each associated with a particular value. Usually
* this value represents the Term's frequency in a particular document; however, it may also
* represent the amount of interest in a Term, as is the case in InterestModel.java.
*
* @author jmole
*
*/
public class TermVector extends FeatureVector<Term> implements ITermVector
{
  /** Weight applied to every word when text is added without an explicit weight. */
  private static final Double DEFAULT_WEIGHT = 1.0;

  /** Name of the boolean preference that makes {@link #toString()} print a number after each term. */
  private static final String SHOW_WEIGHTS_PREF = "show_weights";

  /**
   * Pattern used to extract words from raw text. It matches runs of ASCII letters, optionally
   * joined by single hyphens; because of the trailing mandatory letter group every match contains
   * at least two letters.
   */
  public static Pattern WORD_REGEX = Pattern.compile("[a-zA-Z]+(-[a-zA-Z]+)*([a-zA-Z]+)");

  /** Orders scored terms from the highest tf-idf score to the lowest. */
  private static final Comparator<ScoredTerm> BY_SCORE_DESCENDING = new Comparator<ScoredTerm>()
  {
    @Override
    public int compare(ScoredTerm a, ScoredTerm b)
    {
      return Double.compare(b.score, a.score);
    }
  };

  /**
   * Descending comparator for scores. It is no longer used inside this class, but it is kept
   * because the field is visible to the rest of the package.
   */
  Comparator<Double> reverse = new Comparator<Double>()
  {
    @Override
    public int compare(Double d1, Double d2)
    {
      // Double.compare gives a consistent total order, including for NaN and signed zero.
      return Double.compare(d2, d1);
    }
  };

  /** A term paired with the tf-idf score it had when the ranking was computed. */
  private static final class ScoredTerm
  {
    final Term term;

    final double score;

    ScoredTerm(Term term, double score)
    {
      this.term = term;
      this.score = score;
    }
  }

  /** Creates a TermVector using the superclass defaults. */
  public TermVector ()
  {
  }

  /**
   * Creates a TermVector from another feature vector by delegating to the superclass constructor.
   *
   * @param tv
   *          vector handed to the superclass constructor
   */
  public TermVector ( IFeatureVector<Term> tv )
  {
    super(tv);
  }

  /**
   * Creates a TermVector, passing the given size to the superclass constructor.
   *
   * @param size
   *          size handed to the superclass constructor
   */
  public TermVector ( int size )
  {
    super(size);
  }

  /**
   * Creates a new TermVector from a given String, using the TermDictionary to stem and find the
   * Term associated with each word.
   *
   * @param input
   *          text to tokenize; when null the vector is left as constructed
   */
  public TermVector ( CharSequence input )
  {
    if (input != null)
      reset(input);
  }

  /**
   * Totally reconstructs this term vector based on a new string. Useful for maintaining the
   * observers and such while changing the actual terms.
   *
   * @param input
   *          text whose words replace the current contents; null is treated as empty text
   */
  public void reset (CharSequence input)
  {
    super.reset();
    add(input);
  }

  /**
   * Adds every word found in the input with {@link #DEFAULT_WEIGHT}.
   *
   * @param input
   *          text to tokenize; null is treated as empty text
   */
  public void add(CharSequence input)
  {
    add(input, DEFAULT_WEIGHT);
  }

  /**
   * Tokenizes the input with {@link #WORD_REGEX}, lower-cases each word, looks its Term up in the
   * TermDictionary and adds it with the given weight. The vector is marked changed and observers
   * are notified once, after all words have been processed.
   *
   * @param input
   *          text to tokenize; null is treated as empty text
   * @param weight
   *          value added for every recognised, non-stopword term
   */
  public void add(CharSequence input, Double weight)
  {
    if (input != null)
    {
      StringBuilder termBuffy = StringBuilderUtils.acquire();
      try
      {
        Matcher m = WORD_REGEX.matcher(input);
        while (m.find())
        {
          termBuffy.append(input, m.start(), m.end());
          StringTools.toLowerCase(termBuffy);
          addWithoutNotify(TermDictionary.getTermForWord(termBuffy), weight);
          termBuffy.setLength(0);
        }
      }
      finally
      {
        // Always hand the buffer back, and hand it back empty, even if a lookup failed mid-word.
        termBuffy.setLength(0);
        StringBuilderUtils.release(termBuffy);
      }
    }
    setChanged();
    notifyObservers();
  }

  /**
   * Adds a single term without touching the observers. Null terms and stopwords are ignored.
   *
   * @param term
   *          term to add, may be null
   * @param val
   *          value to add for the term
   */
  private void addWithoutNotify ( Term term, Double val )
  {
    if (term == null || term.isStopword())
      return;
    super.add(term, val);
  }

  /**
   * Adds a single term, then marks the vector changed and notifies observers. Null terms and
   * stopwords are ignored and cause no notification.
   *
   * @param term
   *          term to add, may be null
   * @param val
   *          value to add for the term
   */
  public void add ( Term term, Double val )
  {
    if (term == null || term.isStopword())
      return;
    super.add(term, val);
    setChanged();
    notifyObservers();
  }

  /**
   * Pairwise multiplies this Vector by another Vector, in-place.
   *
   * @param v
   *          Vector by which to multiply
   */
  public void multiply ( IFeatureVector<Term> v )
  {
    super.multiply(v);
    setChanged();
    notifyObservers();
  }

  /**
   * Scalar multiplication of this vector by some constant
   *
   * @param c
   *          Constant to multiply this vector by.
   */
  public void multiply ( double c )
  {
    super.multiply(c);
    setChanged();
    notifyObservers();
  }

  /**
   * Pairwise addition of this vector by some other vector times some constant.<br>
   * i.e. this + (c*v)<br>
   * Vector v is not modified.
   *
   * @param c
   *          Constant which Vector v is multiplied by.
   * @param v
   *          Vector to add to this one
   */
  public void add ( double c, ITermVector v )
  {
    add(c, (IFeatureVector<Term>) v);
  }

  /**
   * Adds another Vector to this Vector, in-place.
   *
   * @param v
   *          Vector to add to this
   */
  public void add (ITermVector v )
  {
    add((IFeatureVector<Term>) v);
  }

  /**
   * Pairwise addition of this vector by some other vector times some constant.<br>
   * i.e. this + (c*v)<br>
   * Vector v is not modified.
   *
   * @param c
   *          Constant which Vector v is multiplied by.
   * @param v
   *          Vector to add to this one
   */
  public void add ( double c, IFeatureVector<Term> v )
  {
    super.add(c, v);
    setChanged();
    notifyObservers();
  }

  /**
   * Adds another Vector to this Vector, in-place.
   *
   * @param v
   *          Vector to add to this
   */
  public void add ( IFeatureVector<Term> v )
  {
    super.add(v);
    setChanged();
    notifyObservers();
  }

  /**
   * Renders the terms of this vector between braces, separated by spaces. When the
   * "show_weights" preference is set, each term is followed by its idf truncated to two decimals.
   *
   * @return textual form of the vector, "{}" when there is no backing map
   */
  @Override
  public String toString ( )
  {
    if (values == null)
      return "{}";
    // The preference does not depend on the term, so read it once instead of once per term.
    boolean showWeights = Pref.usePrefBoolean(SHOW_WEIGHTS_PREF, false).value();
    StringBuilder s = new StringBuilder("{");
    synchronized (values)
    {
      for (Term t : values.keySet())
      {
        s.append(t.toString());
        if (showWeights)
        {
          s.append("(");
          s.append((int) (t.idf() * 100) / 100.);
          s.append("),");
        }
        s.append(" ");
      }
    }
    s.append("}");
    return s.toString();
  }

  /**
   * Concatenates the word of every term, each followed by a single space.
   *
   * @return the words of this vector, "" when there is no backing map
   */
  public String termString ( )
  {
    if (values == null)
      return "";
    StringBuilder s = new StringBuilder();
    synchronized (values)
    {
      for (Term t : values.keySet())
      {
        s.append(t.getWord());
        s.append(" ");
      }
    }
    return s.toString();
  }

  /**
   * idf-weighted dot product with another vector, divided by the number of terms in this vector.
   *
   * @param v
   *          the other vector
   * @return the weighted dot product, 0 when either vector is empty or has zero norm
   */
  public double idfDot ( IFeatureVector<Term> v )
  {
    return idfDot(v, false);
  }

  /**
   * Like {@link #idfDot(IFeatureVector)}, but the other vector only selects which terms take
   * part; its values are not multiplied in.
   *
   * @param v
   *          the other vector
   * @return the weighted sum over the shared terms, 0 when either vector is empty or has zero norm
   */
  public double idfDotSimplex ( IFeatureVector<Term> v )
  {
    return idfDot(v, true);
  }

  /**
   * Shared implementation of the two idf dot products.
   *
   * @param v
   *          the other vector, null yields 0
   * @param simplex
   *          when true the other vector's values are ignored and only term membership counts
   * @return sum over shared terms of idf * this value (* other value unless simplex), divided by
   *         the number of terms in this vector
   */
  private double idfDot ( IFeatureVector<Term> v, boolean simplex )
  {
    if (v == null)
      return 0;
    Map<Term, Double> other = v.map();
    if (other == null || other.size() == 0 || this.values == null || this.norm() == 0 || v.norm() == 0)
      return 0;
    double dot = 0;
    synchronized (values)
    {
      for (Map.Entry<Term, Double> entry : values.entrySet())
      {
        Term term = entry.getKey();
        if (!other.containsKey(term))
          continue;
        double tfIDF = term.idf() * entry.getValue();
        if (!simplex)
          tfIDF *= other.get(term);
        dot += tfIDF;
      }
      dot /= values.size();
    }
    return dot;
  }

  /**
   * Delegates to the superclass clamp, then marks the vector changed and notifies observers.
   *
   * @param clampTo
   *          value passed to the superclass clamp
   */
  public void clamp ( double clampTo )
  {
    super.clamp(clampTo);
    setChanged();
    notifyObservers();
  }

  /**
   * Delegates to the superclass clampExp, then marks the vector changed and notifies observers.
   *
   * @param clampTo
   *          value passed to the superclass clampExp
   */
  public void clampExp ( double clampTo )
  {
    super.clampExp(clampTo);
    setChanged();
    notifyObservers();
  }

  /**
   * @return a copy of this vector on which {@code clamp(1)} has been applied
   */
  @Override
  public TermVector unit ( )
  {
    TermVector v = new TermVector(this);
    v.clamp(1);
    return v;
  }

  /**
   * @return a copy of this vector in which every term has the value 1.0
   */
  @Override
  public TermVector simplex ( )
  {
    TermVector v = new TermVector(this);
    if (v.size() > 0)
    {
      // Overwriting values through the entries does not change the map's structure.
      for (Map.Entry<Term, Double> entry : v.values.entrySet())
      {
        entry.setValue(1.0);
      }
    }
    return v;
  }

  /**
   * IDF trim (ignores tf).
   * Deletes lowest weighted terms until the TermVector only has "size" terms. If "size" is
   * greater than the number of Terms contained in this TermVector, this method does nothing.
   * <p>
   * NOTE(review): the terms are ordered through Term's natural ordering (assumed to be idf based,
   * as the original comment states). A TreeMap keeps only one key per group of keys that compare
   * as equal, so if distinct terms can compare as 0 some of them are dropped here -- confirm
   * against Term.compareTo. Observers are not notified by this method.
   *
   * @param size
   *          the new size (i.e. number of Terms) of the TermVector.
   */
  public void trim ( int size )
  {
    if (values == null)
      return;
    synchronized (values)
    {
      if (size >= values.size())
        return;
      // Sorted by Term's natural ordering, because Term implements Comparable<Term>.
      TreeMap<Term, Double> sortedTerms = new TreeMap<Term, Double>(values);
      values.clear();
      for (Map.Entry<Term, Double> entry : sortedTerms.entrySet())
      {
        if (values.size() == size)
          break;
        values.put(entry.getKey(), entry.getValue());
      }
    }
  }

  /**
   * Returns the terms with the highest tf-idf scores, best first. This vector is not modified.
   *
   * @param size
   *          length of the returned array
   * @return an array of exactly {@code size} slots; when the vector holds fewer terms the
   *         remaining slots are null
   */
  public Term[] tfIdfTrim(int size)
  {
    Term[] result = new Term[size];
    if (values == null)
      return result;
    synchronized (values)
    {
      List<ScoredTerm> ranked = rankByTfIdf();
      // Bound the loop up front so that size == 0 never writes into the empty array.
      int limit = Math.min(size, ranked.size());
      for (int i = 0; i < limit; i++)
      {
        result[i] = ranked.get(i).term;
      }
    }
    return result;
  }

  /**
   * Scores every term of this vector as value * idf and sorts the result from highest to lowest
   * score. Terms with identical scores are all kept. Callers hold the lock on {@code values}.
   *
   * @return the scored terms, best first
   */
  private List<ScoredTerm> rankByTfIdf()
  {
    List<ScoredTerm> ranked = new ArrayList<ScoredTerm>(values.size());
    for (Map.Entry<Term, Double> entry : values.entrySet())
    {
      Term term = entry.getKey();
      ranked.add(new ScoredTerm(term, entry.getValue() * term.idf()));
    }
    Collections.sort(ranked, BY_SCORE_DESCENDING);
    return ranked;
  }

  /**
   * Returns, best first, the terms whose tf-idf score is not below the threshold and that are
   * absent from the ignore vector. This vector is not modified.
   *
   * @param threshold
   *          minimum tf-idf score a term needs to be returned
   * @param ignoreTV
   *          terms to leave out; null means nothing is ignored
   * @return the qualifying terms, possibly empty
   */
  public ArrayList<Term> tfIdfTrim(double threshold, TermVector ignoreTV)
  {
    ArrayList<Term> result = new ArrayList<Term>();
    if (values == null)
      return result;
    Map<Term, Double> ignored = (ignoreTV == null) ? null : ignoreTV.map();
    synchronized (values)
    {
      for (ScoredTerm scored : rankByTfIdf())
      {
        // Scores are descending, so nothing after the first miss can qualify.
        if (scored.score < threshold)
          break;
        if (ignored == null || ignored.get(scored.term) == null) // This term is not in the ignoreTV
          result.add(scored.term);
      }
    }
    return result;
  }

  /**
   * Considers the {@code maxCount} terms with the highest tf-idf scores and returns, best first,
   * those that are absent from the ignore vector. Ignored terms still count against
   * {@code maxCount}, so the result may be shorter. This vector is not modified.
   *
   * @param maxCount
   *          number of top ranked terms to consider
   * @param ignoreTV
   *          terms to leave out; null means nothing is ignored
   * @return at most {@code maxCount} terms, possibly empty
   */
  public ArrayList<Term> tfIdfTrimByCount(int maxCount, TermVector ignoreTV)
  {
    ArrayList<Term> result = new ArrayList<Term>();
    if (values == null)
      return result;
    Map<Term, Double> ignored = (ignoreTV == null) ? null : ignoreTV.map();
    synchronized (values)
    {
      int examined = 0;
      for (ScoredTerm scored : rankByTfIdf())
      {
        // Test before counting, so that exactly maxCount terms are looked at.
        if (examined >= maxCount)
          break;
        examined++;
        if (ignored == null || ignored.get(scored.term) == null) // This term is not in the ignoreTV
          result.add(scored.term);
      }
    }
    return result;
  }

  /**
   * @return the arithmetic mean of {@link #tfIdf(Term)} over all terms, 0 for an empty vector
   */
  public double tfIdfMean()
  {
    if (values == null)
      return 0;
    synchronized (values)
    {
      int n = values.size();
      if (n == 0)
        return 0; // avoid 0/0, which would yield NaN
      double sum = 0;
      for (Term term : values.keySet())
      {
        sum += tfIdf(term);
      }
      return sum / n;
    }
  }

  /**
   * @param term
   *          term to score
   * @return the value stored for the term times its idf, 0 when the term is not in this vector
   */
  public double tfIdf(Term term)
  {
    Double tf = (values == null) ? null : values.get(term);
    if (tf == null)
      return 0;
    return tf * term.idf();
  }

  /**
   * Sets the value of a term through the superclass, then marks the vector changed and notifies
   * observers.
   *
   * @param term
   *          term whose value is set
   * @param val
   *          new value
   */
  @Override
  public void set ( Term term, Double val )
  {
    super.set(term, val);
    setChanged();
    notifyObservers();
  }

  /**
   * @return true when at least one observer is registered on this vector
   */
  public boolean hasObservers()
  {
    return countObservers() > 0;
  }
}