package edu.cmu.minorthird.text.learn;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import edu.cmu.minorthird.text.Span;
import edu.cmu.minorthird.text.TextBase;
import edu.cmu.minorthird.text.TextLabels;
/**
 * Collects per-token frequency statistics over every document span of a
 * {@link TextBase} and derives a heuristic score for each token.
 *
 * <p>For each lower-cased token the following counters are kept:
 * <ul>
 * <li>TF - number of occurrences of the token;</li>
 * <li>DF - number of documents in which the token occurs at least once;</li>
 * <li>PF - number of occurrences that lie inside a span of the configured
 * span type;</li>
 * <li>HS - heuristic score computed from the three counters above.</li>
 * </ul>
 *
 * <p>All statistics are computed once, in the constructor, and are private to
 * the instance. Calling {@link #setLabels} or {@link #setSpanType} afterwards
 * only changes the stored field; it does not recompute anything.
 *
 * Created by IntelliJ IDEA. User: rcwang Date: Mar 29, 2004 Time: 10:14:58 AM
 *
 * @author Richard Wang <rcwang@cmu.edu>
 */
public class FreqAnal{

  private TextLabels labels=null;

  private String spanType="_prediction";

  private static final int TF=0; // term frequency

  private static final int DF=1; // document frequency

  private static final int PF=2; // predicted frequency

  private static final int HS=3; // heuristic score

  private static final int LAST=4; // number of slots stored per token

  /**
   * Lower-cased token -> statistics array indexed by TF, DF, PF and HS.
   * Kept per instance so that separate analyses never contaminate each
   * other's counts.
   */
  private final Map<String,double[]> tokenHash=new HashMap<String,double[]>();

  /**
   * Scans every document span of the labels' text base, counts the tokens
   * and computes the heuristic score of each token seen.
   *
   * @param labels labels providing the text base and the spans of
   *        <code>spanType</code>
   * @param spanType span type whose instances mark a token as "predicted"
   */
  public FreqAnal(TextLabels labels,String spanType){
    this.labels=labels;
    this.spanType=spanType;
    TextBase base=labels.getTextBase();
    for(Iterator<Span> i=base.documentSpanIterator();i.hasNext();){
      Span docSpan=i.next();
      List<String> tfList=new ArrayList<String>();
      List<String> pfList=new ArrayList<String>();
      for(int j=0;j<docSpan.size();j++){
        Span tokenSpan=docSpan.subSpan(j,1);
        String token=tokenSpan.asString().toLowerCase();
        if(isPredictedName(spanType,tokenSpan,labels)){
          pfList.add(token);
        }
        tfList.add(token);
      }
      updateHash(tfList,TF);
      // Each distinct token of the document counts once towards DF.
      updateHash(uniqueList(tfList),DF);
      updateHash(pfList,PF);
    }
    updateHScore(base.size(),HS);
  }

  public TextLabels getLabels(){
    return labels;
  }

  /**
   * Replaces the stored labels. The statistics computed by the constructor
   * are not recomputed.
   */
  public void setLabels(TextLabels labels){
    this.labels=labels;
  }

  public String getSpanType(){
    return spanType;
  }

  /**
   * Replaces the stored span type. The statistics computed by the
   * constructor are not recomputed.
   */
  public void setSpanType(String spanType){
    this.spanType=spanType;
  }

  /**
   * Writes one line per token to standard output:
   * the token followed by its TF, DF, PF and HS values.
   */
  public void print(){
    for(Map.Entry<String,double[]> entry:tokenHash.entrySet()){
      double[] stats=entry.getValue();
      System.out.println(entry.getKey()+" "+stats[TF]+" "+stats[DF]+" "+
          stats[PF]+" "+stats[HS]);
    }
  }

  /**
   * Stores the heuristic score of every known token in slot
   * <code>type</code> and echoes the token with its HS value to standard
   * output.
   *
   * @param numDoc value used as the document count in the score
   * @param type slot of the statistics array that receives the score
   */
  private void updateHScore(int numDoc,int type){
    for(Map.Entry<String,double[]> entry:tokenHash.entrySet()){
      double[] stats=entry.getValue();
      stats[type]=tfIdf(stats,numDoc);
      System.out.println(entry.getKey()+" "+stats[HS]);
    }
  }

  /**
   * Heuristic score:
   * <code>PF/(TF+2) * log((numDoc+0.5)/DF) / log(numDoc+1) * 100</code>.
   * The evaluation order of the original expression is kept so that the
   * floating-point result is unchanged.
   */
  private static double tfIdf(double[] stats,int numDoc){
    return stats[PF]/(stats[TF]+2)*
        Math.log((numDoc+0.5)/stats[DF])/Math.log(numDoc+1)*100;
  }

  /**
   * Adds one to slot <code>type</code> for every element of
   * <code>list</code>, creating a zero-filled statistics array for tokens
   * seen for the first time.
   */
  private void updateHash(List<String> list,int type){
    for(String token:list){
      double[] stats=tokenHash.get(token);
      if(stats==null){
        stats=new double[LAST]; // Java zero-initialises the slots
        tokenHash.put(token,stats);
      }
      stats[type]+=1;
    }
  }

  /**
   * Tells whether <code>test</code> is contained in any span of type
   * <code>spanType</code> within the same document.
   */
  private static boolean isPredictedName(String spanType,Span test,
      TextLabels labels){
    for(Iterator<Span> i=
        labels.instanceIterator(spanType,test.getDocumentId());i.hasNext();){
      Span name=i.next();
      if(name.contains(test)){
        return true;
      }
    }
    return false;
  }

  /**
   * Returns the heuristic score of a token.
   *
   * @param term token to look up; keys are stored lower-cased and the
   *        argument is looked up exactly as given
   * @return the score, or <code>null</code> if the token was never seen
   */
  public Double getHScore(String term){
    double[] stats=tokenHash.get(term);
    return (stats!=null)?Double.valueOf(stats[HS]):null;
  }

  /** Returns the distinct elements of <code>list</code> in unspecified order. */
  private static List<String> uniqueList(List<String> list){
    return new ArrayList<String>(new HashSet<String>(list));
  }
}