package edu.stanford.nlp.ie;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Index;
import java.util.List;
/** This was the empirical NER prior used for long distance consistency
* in the Finkel et al. ACL 2005 paper.
*
* @author Jenny Finkel
*/
public class EmpiricalNERPrior<IN extends CoreMap> extends EntityCachingAbstractSequencePrior<IN> {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(EmpiricalNERPrior.class);

  protected static final String ORG = "ORGANIZATION";
  protected static final String PER = "PERSON";
  protected static final String LOC = "LOCATION";
  protected static final String MISC = "MISC";

  /**
   * Creates the prior; all arguments are passed straight through to the superclass constructor.
   *
   * @param backgroundSymbol forwarded to the superclass
   * @param classIndex forwarded to the superclass; {@link #scoreOf(int[])} uses it to turn an
   *     entity's integer type into its label string
   * @param doc forwarded to the superclass
   */
  public EmpiricalNERPrior(String backgroundSymbol, Index<String> classIndex, List<IN> doc) {
    super(backgroundSymbol, classIndex, doc);
  }

  // All p* fields below are costs (negative log ratios). scoreOf subtracts them from the
  // running score, so a larger value means a stronger penalty. They are deliberately left
  // protected and non-final so that subclasses can retune them.
  // NOTE(review): the numerators/denominators look like corpus counts from the paper cited
  // in the class Javadoc -- confirm before changing any of them.

  /** Per-word cost applied when two linked mentions have different lengths. */
  protected double p1 = -Math.log(0.01);

  // p2..p17: "exact match" table (the other mention points back into this mention's span).
  // Rows are the tag of the entity being scored, columns the tag of the other mention, in the
  // order LOCATION, ORGANIZATION, PERSON, MISC. Every ratio is halved. The same-tag cells
  // (p2, p7, p12, p17) are kept for subclasses but are not applied by scoreOf.

  /** Denominator for the LOCATION row of the exact-match table. */
  protected double dem1 = 6631.0;
  protected double p2 = -Math.log(6436.0 / dem1)/2.0;
  protected double p3 = -Math.log(188 / dem1)/2.0;
  protected double p4 = -Math.log(4 / dem1)/2.0;
  protected double p5 = -Math.log(3 / dem1)/2.0;

  /** Denominator for the ORGANIZATION row of the exact-match table. */
  protected double dem2 = 3169.0;
  protected double p6 = -Math.log(188.0 / dem2)/2.0;
  protected double p7 = -Math.log(2975 / dem2)/2.0;
  protected double p8 = -Math.log(5 / dem2)/2.0;
  protected double p9 = -Math.log(1 / dem2)/2.0;

  /** Denominator for the PERSON row of the exact-match table. */
  protected double dem3 = 3151.0;
  protected double p10 = -Math.log(4.0 / dem3)/2.0;
  protected double p11 = -Math.log(5 / dem3)/2.0;
  protected double p12 = -Math.log(3141 / dem3)/2.0;
  protected double p13 = -Math.log(1 / dem3)/2.0;

  /** Denominator for the MISC row of the exact-match table. */
  protected double dem4 = 2035.0;
  protected double p14 = -Math.log(3.0 / dem4)/2.0;
  protected double p15 = -Math.log(1 / dem4)/2.0;
  protected double p16 = -Math.log(1 / dem4)/2.0;
  protected double p17 = -Math.log(2030 / dem4)/2.0;

  // p18..p33: "non-exact match" table, same row/column layout as above but not halved.
  // Cells p18, p19, p23 and p28 are kept for subclasses but are not applied by scoreOf;
  // note that the MISC/MISC cell (p33) IS applied.

  /** Denominator for the LOCATION row of the non-exact-match table. */
  protected double dem5 = 724.0;
  protected double p18 = -Math.log(167.0 / dem5);
  protected double p19 = -Math.log(328.0 / dem5);
  protected double p20 = -Math.log(5.0 / dem5);
  protected double p21 = -Math.log(224.0 / dem5);

  /** Denominator for the ORGANIZATION row of the non-exact-match table. */
  protected double dem6 = 834.0;
  protected double p22 = -Math.log(6.0 / dem6);
  protected double p23 = -Math.log(819.0 / dem6);
  protected double p24 = -Math.log(2.0 / dem6);
  protected double p25 = -Math.log(7.0 / dem6);

  /** Denominator for the PERSON row of the non-exact-match table. */
  protected double dem7 = 1978.0;
  protected double p26 = -Math.log(1.0 / dem7);
  protected double p27 = -Math.log(22.0 / dem7);
  protected double p28 = -Math.log(1941.0 / dem7);
  protected double p29 = -Math.log(14.0 / dem7);

  /** Denominator for the MISC row of the non-exact-match table. */
  protected double dem8 = 622.0;
  protected double p30 = -Math.log(63.0 / dem8);
  protected double p31 = -Math.log(191.0 / dem8);
  protected double p32 = -Math.log(3.0 / dem8);
  protected double p33 = -Math.log(365.0 / dem8);

  /**
   * Scores the current labeling by penalizing inconsistently labeled repeated mentions.
   *
   * <p>For every entity in {@code entities}, each position listed in its
   * {@code otherOccurrences} is inspected. If an entity is found there, a cost depending on
   * the two labels (and, for exact matches, on the two lengths) is subtracted from the score.
   * Occurrences where no entity is found contribute nothing.
   *
   * @param sequence not read directly by this method; scoring uses the {@code entities} array
   *     (presumably kept in sync with the sequence by the superclass -- verify there)
   * @return the accumulated score; {@code 0.0} when no penalty applies, otherwise lower
   */
  @Override
  public double scoreOf(int[] sequence) {
    double p = 0.0;
    for (int i = 0; i < entities.length; i++) {
      Entity entity = entities[i];
      // Skip empty slots, and slots holding the same Entity object as the previous slot, so
      // that a run of identical references is scored only once (at its first position).
      if (entity == null || (i > 0 && entities[i - 1] == entity)) {
        continue;
      }

      int length = entity.words.size();
      String tag1 = classIndex.get(entity.type);

      for (int otherOccurrence : entity.otherOccurrences) {
        Entity otherEntity = firstEntityInSpan(otherOccurrence, length);
        if (otherEntity == null) {
          // Nothing is labeled as an entity at the other occurrence: no penalty.
          continue;
        }

        int oLength = otherEntity.words.size();
        String tag2 = classIndex.get(otherEntity.type);

        // "Exact" means the other entity lists a position inside this entity's span among
        // its own other occurrences.
        if (containsPositionInSpan(otherEntity.otherOccurrences, i, length)) {
          if (length != oLength) {
            if (tag1.equals(tag2)) {
              // Same label, different extent: charge for the words that differ.
              p -= Math.abs(oLength - length) * p1;
            } else if (!isOrgLocPair(tag1, tag2)) {
              // Different label and different extent: charge for all words of both mentions.
              p -= (oLength + length) * p1;
            }
          }
          p -= length * exactMatchPenalty(tag1, tag2);
        } else {
          p -= length * nonExactMatchPenalty(tag1, tag2);
        }
      }
    }
    return p;
  }

  /**
   * Returns the first non-null entity found in {@code entities[start .. start + length - 1]},
   * clipped to the end of the array, or {@code null} if there is none.
   */
  private Entity firstEntityInSpan(int start, int length) {
    for (int k = start; k < start + length && k < entities.length; k++) {
      if (entities[k] != null) {
        return entities[k];
      }
    }
    return null;
  }

  /** Returns whether any of {@code positions} lies in {@code [start, start + length - 1]}. */
  private static boolean containsPositionInSpan(int[] positions, int start, int length) {
    int last = start + length - 1;
    for (int position : positions) {
      if (position >= start && position <= last) {
        return true;
      }
    }
    return false;
  }

  /**
   * Returns whether the two tags are one ORGANIZATION and one LOCATION, in either order.
   *
   * <p>The previous inline condition tested the (ORGANIZATION, LOCATION) ordering twice, so
   * the (LOCATION, ORGANIZATION) ordering was not exempted from the length-mismatch penalty.
   * NOTE(review): treating the pair symmetrically changes scores for that one ordering --
   * confirm this matches the intended model.
   */
  private static boolean isOrgLocPair(String tag1, String tag2) {
    return (ORG.equals(tag1) && LOC.equals(tag2)) || (LOC.equals(tag1) && ORG.equals(tag2));
  }

  /**
   * Looks up the per-word cost for an exact match between a mention tagged {@code tag1} and
   * another mention tagged {@code tag2}. Same-tag pairs and unknown tags cost {@code 0.0}.
   */
  private double exactMatchPenalty(String tag1, String tag2) {
    if (LOC.equals(tag1)) {
      if (ORG.equals(tag2)) {
        return p3;
      } else if (PER.equals(tag2)) {
        return p4;
      } else if (MISC.equals(tag2)) {
        return p5;
      }
    } else if (ORG.equals(tag1)) {
      if (LOC.equals(tag2)) {
        return p6;
      } else if (PER.equals(tag2)) {
        return p8;
      } else if (MISC.equals(tag2)) {
        return p9;
      }
    } else if (PER.equals(tag1)) {
      if (LOC.equals(tag2)) {
        return p10;
      } else if (ORG.equals(tag2)) {
        return p11;
      } else if (MISC.equals(tag2)) {
        return p13;
      }
    } else if (MISC.equals(tag1)) {
      if (LOC.equals(tag2)) {
        return p14;
      } else if (ORG.equals(tag2)) {
        return p15;
      } else if (PER.equals(tag2)) {
        return p16;
      }
    }
    return 0.0;
  }

  /**
   * Looks up the per-word cost for a non-exact match between a mention tagged {@code tag1}
   * and another mention tagged {@code tag2}. Pairs with no active table cell and unknown tags
   * cost {@code 0.0}; unlike the exact-match table, MISC/MISC is charged here.
   */
  private double nonExactMatchPenalty(String tag1, String tag2) {
    if (LOC.equals(tag1)) {
      if (PER.equals(tag2)) {
        return p20;
      } else if (MISC.equals(tag2)) {
        return p21;
      }
    } else if (ORG.equals(tag1)) {
      if (LOC.equals(tag2)) {
        return p22;
      } else if (PER.equals(tag2)) {
        return p24;
      } else if (MISC.equals(tag2)) {
        return p25;
      }
    } else if (PER.equals(tag1)) {
      if (LOC.equals(tag2)) {
        return p26;
      } else if (ORG.equals(tag2)) {
        return p27;
      } else if (MISC.equals(tag2)) {
        return p29;
      }
    } else if (MISC.equals(tag1)) {
      if (LOC.equals(tag2)) {
        return p30;
      } else if (ORG.equals(tag2)) {
        return p31;
      } else if (PER.equals(tag2)) {
        return p32;
      } else if (MISC.equals(tag2)) {
        return p33;
      }
    }
    return 0.0;
  }
}