/*******************************************************************************
* Copyright 2012 University of Southern California
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This code was developed by the Information Integration Group as part
* of the Karma project at the Information Sciences Institute of the
* University of Southern California. For more information, publications,
* and related projects, please see: http://www.isi.edu/integration
******************************************************************************/
package edu.isi.karma.cleaning.features;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Vector;
import org.antlr.runtime.ANTLRStringStream;
import org.antlr.runtime.CharStream;
import org.antlr.runtime.Token;
import edu.isi.karma.cleaning.TNode;
import edu.isi.karma.cleaning.Tokenizer;
/**
 * A {@link FeatureSet} that measures the "regularity" of a collection of example
 * strings before and after transformation. Each example is tokenized with the
 * ANTLR-generated {@link Tokenizer}; per-symbol and per-token-type count features
 * ({@link CntFeature}) and movement features ({@link MovFeature}) are then built
 * over the two token-sequence collections.
 */
public class RegularityFeatureSet implements FeatureSet {
	// Token sequences of the transformed (target) examples, filled by computeFeatures().
	public ArrayList<Vector<TNode>> tokenseqs;
	// Token sequences of the original (source) examples, filled by computeFeatures().
	public ArrayList<Vector<TNode>> otokenseqs;
	// Names of the features produced by the most recent computeFeatures() call.
	public Vector<String> fnames;
	public static String[] targets = {"#",";",",","!","~","@","$","%","^","&","*","(",")","_","-","{","}","[","]","\"","\'",":","?","<",">",".","bnk","syb","wrd","num"};

	// ANTLR token-type codes emitted by the generated Tokenizer lexer.
	// NOTE(review): these magic numbers were hard-coded in the original; they must
	// match the generated lexer's .tokens file — verify if the grammar changes.
	private static final int LEX_UPPER_WORD = 15;
	private static final int LEX_BLANK = 4;
	private static final int LEX_NUMBER = 10;
	private static final int LEX_SYMBOL = 12;
	private static final int LEX_LOWER_WORD = 9;

	public RegularityFeatureSet()
	{
		tokenseqs = new ArrayList<Vector<TNode>>();
		otokenseqs = new ArrayList<Vector<TNode>>();
		fnames = new Vector<String>();
	}

	/**
	 * Tokenizes a string with the ANTLR lexer and maps each lexer token type to
	 * the corresponding {@link TNode} type constant.
	 *
	 * @param Org the raw input string
	 * @return the sequence of TNodes; unrecognized lexer types get TNode type -1
	 */
	public Vector<TNode> tokenizer(String Org)
	{
		CharStream cs = new ANTLRStringStream(Org);
		Tokenizer tk = new Tokenizer(cs);
		Vector<TNode> x = new Vector<TNode>();
		// ANTLR signals end-of-input with token type -1 (Token.EOF).
		for (Token t = tk.nextToken(); t.getType() != -1; t = tk.nextToken())
		{
			int mytype;
			switch (t.getType())
			{
				case LEX_UPPER_WORD: mytype = TNode.UWRDTYP; break;
				case LEX_BLANK:      mytype = TNode.BNKTYP;  break;
				case LEX_NUMBER:     mytype = TNode.NUMTYP;  break;
				case LEX_SYMBOL:     mytype = TNode.SYBSTYP; break;
				case LEX_LOWER_WORD: mytype = TNode.LWRDTYP; break;
				default:             mytype = -1;            break; // unknown lexer type
			}
			x.add(new TNode(mytype, t.getText()));
		}
		return x;
	}

	/**
	 * Builds a count feature over a single token type (matching any text).
	 *
	 * @param toktype one of the TNode type constants
	 * @param name    the feature name to assign
	 * @return the configured CntFeature comparing otokenseqs against tokenseqs
	 */
	private CntFeature buildTypeCntFeature(int toktype, String name)
	{
		Vector<TNode> pattern = new Vector<TNode>();
		pattern.add(new TNode(toktype, TNode.ANYTOK));
		CntFeature f = new CntFeature(this.otokenseqs, this.tokenseqs, pattern);
		f.setName(name);
		return f;
	}

	/**
	 * Tokenizes both example collections and constructs the full feature vector:
	 * one count feature and one movement feature per punctuation symbol, plus one
	 * count feature per coarse token type (blank, symbol, lower-case word, number).
	 * Feature names are recorded in {@link #fnames}.
	 *
	 * @param examples  transformed (target) example strings
	 * @param oexamples original (source) example strings
	 * @return all constructed features (count features first, then movement features)
	 */
	public Collection<Feature> computeFeatures(Collection<String> examples, Collection<String> oexamples) {
		Vector<Feature> r = new Vector<Feature>();
		for (String s : examples)
		{
			this.tokenseqs.add(this.tokenizer(s));
		}
		for (String s : oexamples)
		{
			this.otokenseqs.add(this.tokenizer(s));
		}
		// Individual punctuation symbols tracked with dedicated features.
		String[] symbol = {"#",";",",","!","~","@","$","%","^","&","*","(",")","_","-","{","}","[","]","\"","'",":","?","<",">","."};
		Vector<CntFeature> cntfs = new Vector<CntFeature>(symbol.length);
		Vector<MovFeature> movfs = new Vector<MovFeature>(symbol.length);
		for (int i = 0; i < symbol.length; i++)
		{
			// Both features share the same single-token pattern, as in the original.
			Vector<TNode> li = new Vector<TNode>();
			li.add(new TNode(TNode.SYBSTYP, symbol[i]));
			CntFeature cnt = new CntFeature(this.otokenseqs, this.tokenseqs, li);
			cnt.setName("entr_cnt_" + symbol[i]);
			cntfs.add(cnt);
			MovFeature mov = new MovFeature(this.otokenseqs, this.tokenseqs, li);
			// NOTE(review): "entr_mov" lacks the underscore used by "entr_cnt_";
			// kept byte-identical because these names may be consumed downstream.
			mov.setName("entr_mov" + symbol[i]);
			movfs.add(mov);
		}
		// Coarse per-token-type counts: blank, symbol, lower-case word, number.
		cntfs.add(buildTypeCntFeature(TNode.BNKTYP, "entr_cnt_bnk"));
		cntfs.add(buildTypeCntFeature(TNode.SYBSTYP, "entr_cnt_syb"));
		cntfs.add(buildTypeCntFeature(TNode.LWRDTYP, "entr_cnt_lwrd"));
		cntfs.add(buildTypeCntFeature(TNode.NUMTYP, "entr_cnt_num"));
		r.addAll(cntfs);
		r.addAll(movfs);
		for (Feature f : r)
		{
			fnames.add(f.getName());
		}
		return r;
	}

	/**
	 * Increments the histogram bucket that {@code a} falls into. The range [0,1]
	 * is split into ten equal-width buckets; the last bucket is closed at 1.0.
	 * Values outside [0,1] (and NaN) are silently ignored, matching the original
	 * branch chain.
	 *
	 * @param a   a value expected to lie in [0,1]
	 * @param buk histogram of at least 10 buckets, updated in place
	 */
	public static void buildEntropy(double a, int[] buk)
	{
		for (int i = 0; i < 10; i++)
		{
			// i/10.0 rounds to the same double as the literals 0.0,0.1,...,0.9
			// used by the original comparisons, so bucket edges are identical.
			double lo = i / 10.0;
			double hi = (i + 1) / 10.0;
			boolean inBucket = (i == 9) ? (a >= lo && a <= hi) : (a >= lo && a < hi);
			if (inBucket)
			{
				buk[i] += 1;
				return;
			}
		}
	}

	/**
	 * Computes the Shannon entropy (natural log) of a count histogram.
	 *
	 * @param a bucket counts
	 * @return the entropy; when all counts are zero, returns ln(10) —
	 *         presumably the maximum entropy for the 10-bucket histograms built
	 *         by {@link #buildEntropy}, so "no evidence" ranks as most uncertain
	 */
	public static double calShannonEntropy(int[] a)
	{
		int total = 0;
		for (int c : a)
		{
			total += c;
		}
		if (total == 0)
			return Math.log(10);
		double entropy = 0.0;
		for (int count : a)
		{
			if (count == 0)
				continue; // 0*log(0) contributes nothing
			double freq = count * 1.0 / total;
			entropy -= freq * Math.log(freq);
		}
		return entropy;
	}

	/** @return the names recorded by the last {@link #computeFeatures} call. */
	public Collection<String> getFeatureNames() {
		return fnames;
	}
}