package edu.stanford.nlp.tagger.maxent;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.tagger.common.Tagger;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.HashIndex;
import edu.stanford.nlp.util.Index;
import java.io.IOException;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.util.*;
/**
* This class holds the POS tags, assigns them unique ids, and knows which tags
* are open versus closed class.
* <p/>
* Title: StanfordMaxEnt<p>
* Description: A Maximum Entropy Toolkit<p>
* Company: Stanford University<p>
*
* @author Kristina Toutanova
* @version 1.0
*/
public class TTags {
private Index<String> index = new HashIndex<>();
private final Set<String> closed = Generics.newHashSet();
private Set<String> openTags = null; /* cache */
private final boolean isEnglish; // for speed
private static final boolean doDeterministicTagExpansion = true;
/** If true, then the open tags are fixed and we set closed tags based on
* index-openTags; otherwise, we set open tags based on index-closedTags.
*/
private boolean openFixed = false;
/** When making a decision based on the training data as to whether a
* tag is closed, this is the threshold for how many tokens can be in
* a closed class - purposely conservative.
* TODO: make this an option you can set; need to pass in TaggerConfig object and then can say = config.getClosedTagThreshold());
*/
private final int closedTagThreshold = Integer.parseInt(TaggerConfig.CLOSED_CLASS_THRESHOLD);
/** If true, when a model is trained, all tags that had fewer tokens than
* closedTagThreshold will be considered closed.
*/
private boolean learnClosedTags = false;
public TTags() {
isEnglish = false;
}
/*
public TTags(TaggerConfig config) {
String[] closedArray = config.getClosedClassTags();
String[] openArray = config.getOpenClassTags();
if(closedArray.length > 0) {
closed = Generics.newHashSet(Arrays.asList(closedArray));
} else if(openArray.length > 0) {
openTags = Generics.newHashSet(Arrays.asList(openArray));
} else {
learnClosedTags = config.getLearnClosedClassTags();
closedTagThreshold = config.getClosedTagThreshold();
}
}
*/
TTags(String language) {
if (language.equalsIgnoreCase("english")) {
closed.add(".");
closed.add(",");
closed.add("``");
closed.add("''");
closed.add(":");
closed.add("$");
closed.add("EX");
closed.add("(");
closed.add(")");
closed.add("#");
closed.add("MD");
closed.add("CC");
closed.add("DT");
closed.add("LS");
closed.add("PDT");
closed.add("POS");
closed.add("PRP");
closed.add("PRP$");
closed.add("RP");
closed.add("TO");
closed.add(Tagger.EOS_TAG);
closed.add("UH");
closed.add("WDT");
closed.add("WP");
closed.add("WP$");
closed.add("WRB");
closed.add("-LRB-");
closed.add("-RRB-");
// closed.add("IN");
isEnglish = true;
} else if(language.equalsIgnoreCase("polish")) {
closed.add(".");
closed.add(",");
closed.add("``");
closed.add("''");
closed.add(":");
closed.add("$");
closed.add("(");
closed.add(")");
closed.add("#");
closed.add("POS");
closed.add(Tagger.EOS_TAG);
closed.add("ppron12");
closed.add("ppron3");
closed.add("siebie");
closed.add("qub");
closed.add("conj");
isEnglish = false;
} else if(language.equalsIgnoreCase("chinese")) {
/* chinese treebank 5 tags */
closed.add("AS");
closed.add("BA");
closed.add("CC");
closed.add("CS");
closed.add("DEC");
closed.add("DEG");
closed.add("DER");
closed.add("DEV");
closed.add("DT");
closed.add("ETC");
closed.add("IJ");
closed.add("LB");
closed.add("LC");
closed.add("P");
closed.add("PN");
closed.add("PU");
closed.add("SB");
closed.add("SP");
closed.add("VC");
closed.add("VE");
isEnglish = false;
} else if (language.equalsIgnoreCase("arabic")) {
// kulick tag set
// the following tags seem to be complete sets in the training
// data (see the comments for "german" for more info)
closed.add("PUNC");
closed.add("CC");
closed.add("CPRP$");
closed.add(Tagger.EOS_TAG);
// maybe more should still be added ... cdm jun 2006
isEnglish = false;
} else if(language.equalsIgnoreCase("german")) {
// The current version of the German tagger is built with the
// negra-tiger data set. We use the STTS tag set. In
// particular, we use the version with the changes described in
// appendix A-2 of
// http://www.uni-potsdam.de/u/germanistik/ls_dgs/tiger1-intro.pdf
// eg the STTS tag set with PROAV instead of PAV
// To find the closed tags, we use lists of standard closed German
// tags, eg
// http://www.sfs.uni-tuebingen.de/Elwis/stts/Wortlisten/WortFormen.html
// In other words:
//
// APPO APPR APPRART APZR ART KOKOM KON KOUI KOUS PDAT PDS PIAT
// PIDAT PIS PPER PPOSAT PPOSS PRELAT PRELS PRF PROAV PTKA
// PTKANT PTKNEG PTKVZ PTKZU PWAT PWAV PWS VAFIN VAIMP VAINF
// VAPP VMFIN VMINF VMPP
//
// One issue with this is that our training data does not have
// the complete collection of many of these closed tags. For
// example, words with the tag APPR show up in the test or dev
// sets without ever showing up in the training. Tags that
// don't have this property:
//
// KOKOM PPOSS PTKA PTKNEG PWAT VAINF VAPP VMINF VMPP
closed.add("$,");
closed.add("$.");
closed.add("$(");
closed.add("--"); // this shouldn't be a tag of the dataset, but was a conversion bug!
closed.add(Tagger.EOS_TAG);
closed.add("KOKOM");
closed.add("PPOSS");
closed.add("PTKA");
closed.add("PTKNEG");
closed.add("PWAT");
closed.add("VAINF");
closed.add("VAPP");
closed.add("VMINF");
closed.add("VMPP");
isEnglish = false;
} else if (language.equalsIgnoreCase("french")) {
// Using the french treebank, with Spence's adaptations of
// Candito's treebank modifications, we get that only the
// punctuation tags are reliably closed:
// !, ", *, ,, -, -LRB-, -RRB-, ., ..., /, :, ;, =, ?, [, ]
closed.add("!");
closed.add("\"");
closed.add("*");
closed.add(",");
closed.add("-");
closed.add("-LRB-");
closed.add("-RRB-");
closed.add(".");
closed.add("...");
closed.add("/");
closed.add(":");
closed.add(";");
closed.add("=");
closed.add("?");
closed.add("[");
closed.add("]");
isEnglish = false;
} else if (language.equalsIgnoreCase("spanish")) {
closed.add(Tagger.EOS_TAG);
// conjunctions
closed.add("cc");
closed.add("cs");
// punctuation
closed.add("faa");
closed.add("fat");
closed.add("fc");
closed.add("fca");
closed.add("fct");
closed.add("fd");
closed.add("fe");
closed.add("fg");
closed.add("fh");
closed.add("fia");
closed.add("fit");
closed.add("fla");
closed.add("flt");
closed.add("fp");
closed.add("fpa");
closed.add("fpt");
closed.add("fra");
closed.add("frc");
closed.add("fs");
closed.add("ft");
closed.add("fx");
closed.add("fz");
isEnglish = false;
} else if (language.equalsIgnoreCase("medpost")) {
closed.add(".");
closed.add(",");
closed.add("``");
closed.add("''");
closed.add(":");
closed.add("$");
closed.add("EX");
closed.add("(");
closed.add(")");
closed.add("VM");
closed.add("CC");
closed.add("DD");
closed.add("DB");
closed.add("GE");
closed.add("PND");
closed.add("PNG");
closed.add("TO");
closed.add(Tagger.EOS_TAG);
closed.add("-LRB-");
closed.add("-RRB-");
isEnglish = false;
} else if (language.equalsIgnoreCase("testing")) {
closed.add(".");
closed.add(Tagger.EOS_TAG);
isEnglish = false;
} else if (language.equalsIgnoreCase("")) {
isEnglish = false;
}
/* add closed-class lists for other languages here */
else {
throw new RuntimeException("unknown language: " + language);
}
}
/** Return the Set of tags used by this tagger (available after training the tagger).
*
* @return The Set of tags used by this tagger
*/
public Set<String> tagSet() {
return new HashSet<>(index.objectsList());
}
/**
* Returns a list of all open class tags
* @return set of open tags
*/
public Set<String> getOpenTags() {
if (openTags == null) { /* cache check */
Set<String> open = Generics.newHashSet();
for (String tag : index) {
if ( ! closed.contains(tag)) {
open.add(tag);
}
}
openTags = open;
} // if
return openTags;
}
protected int add(String tag) {
return index.addToIndex(tag);
}
public String getTag(int i) {
return index.get(i);
}
protected void save(String filename,
Map<String, Set<String>> tagTokens) {
try {
DataOutputStream out = IOUtils.getDataOutputStream(filename);
save(out, tagTokens);
out.close();
} catch (IOException e) {
throw new RuntimeIOException(e);
}
}
protected void save(DataOutputStream file,
Map<String, Set<String>> tagTokens) {
try {
file.writeInt(index.size());
for (String item : index) {
file.writeUTF(item);
if (learnClosedTags) {
if (tagTokens.get(item).size() < closedTagThreshold) {
markClosed(item);
}
}
file.writeBoolean(isClosed(item));
}
} catch (IOException e) {
throw new RuntimeIOException(e);
}
}
protected void read(String filename) {
try {
DataInputStream in = IOUtils.getDataInputStream(filename);
read(in);
in.close();
} catch (IOException e) {
e.printStackTrace();
}
}
protected void read(DataInputStream file) {
try {
int size = file.readInt();
index = new HashIndex<>();
for (int i = 0; i < size; i++) {
String tag = file.readUTF();
boolean inClosed = file.readBoolean();
index.add(tag);
if (inClosed) closed.add(tag);
}
} catch (IOException e) {
e.printStackTrace();
}
}
protected boolean isClosed(String tag) {
if (openFixed) {
return !openTags.contains(tag);
} else {
return closed.contains(tag);
}
}
void markClosed(String tag) {
add(tag);
closed.add(tag);
}
public void setLearnClosedTags(boolean learn) {
learnClosedTags = learn;
}
public void setOpenClassTags(String[] openClassTags) {
openTags = Generics.newHashSet();
openTags.addAll(Arrays.asList(openClassTags));
for (String tag : openClassTags) {
add(tag);
}
openFixed = true;
}
public void setClosedClassTags(String[] closedClassTags) {
for(String tag : closedClassTags) {
markClosed(tag);
}
}
int getIndex(String tag) {
return index.indexOf(tag);
}
public int getSize() {
return index.size();
}
/**
* Deterministically adds other possible tags for words given observed tags.
* For instance, for English with the Penn POS tag, a word with the VB
* tag would also be expected to have the VBP tag.
* <p>
* The current implementation is a bit contorted, as it works to avoid
* object allocations wherever possible for maximum runtime speed. But
* intuitively it's just: For English (only),
* if the VBD tag is present but not VBN, add it, and vice versa;
* if the VB tag is present but not VBP, add it, and vice versa.
*
* @param tags Known possible tags for the word
* @return A superset of tags
*/
String[] deterministicallyExpandTags(String[] tags) {
if (isEnglish && doDeterministicTagExpansion) {
boolean seenVBN = false;
boolean seenVBD = false;
boolean seenVB = false;
boolean seenVBP = false;
for (String tag : tags) {
char ch = tag.charAt(0);
if (ch == 'V') {
switch (tag) {
case "VBD":
seenVBD = true;
break;
case "VBN":
seenVBN = true;
break;
case "VB":
seenVB = true;
break;
case "VBP":
seenVBP = true;
break;
}
}
}
int toAdd = 0;
if ((seenVBN ^ seenVBD)) { // ^ is xor
toAdd++;
}
if (seenVB ^ seenVBP) {
toAdd++;
}
if (toAdd > 0) {
int ind = tags.length;
String[] newTags = new String[ind + toAdd];
System.arraycopy(tags, 0, newTags, 0, tags.length);
if (seenVBN && ! seenVBD) {
newTags[ind++] = "VBD";
} else if (seenVBD && ! seenVBN) {
newTags[ind++] = "VBN";
}
if (seenVB && ! seenVBP) {
newTags[ind] = "VBP";
} else if (seenVBP && ! seenVB) {
newTags[ind] = "VB";
}
return newTags;
} else {
return tags;
}
} else {
// no tag expansion for other languages currently
return tags;
}
}
@Override
public String toString() {
StringBuilder s = new StringBuilder();
s.append(index.toString());
s.append(' ');
if (openFixed) {
s.append(" OPEN:").append(getOpenTags());
} else {
s.append(" open:").append(getOpenTags()).append(" CLOSED:").append(closed);
}
return s.toString();
}
}