package edu.stanford.nlp.tagger.maxent;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.tagger.common.Tagger;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.HashIndex;
import edu.stanford.nlp.util.Index;

import java.io.IOException;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.util.*;

/**
 * This class holds the POS tags, assigns them unique ids, and knows which tags
 * are open versus closed class.
 * <p>
 * Title: StanfordMaxEnt<p>
 * Description: A Maximum Entropy Toolkit<p>
 * Company: Stanford University<p>
 *
 * @author Kristina Toutanova
 * @version 1.0
 */
public class TTags {

  /** Maps each tag to its integer id; replaced wholesale by {@link #read(DataInputStream)}. */
  private Index<String> index = new HashIndex<>();

  /** Tags known to be closed class (seeded per language, extended by markClosed/read). */
  private final Set<String> closed = Generics.newHashSet();

  /**
   * Either the fixed set of open tags (when {@link #openFixed} is true) or a
   * lazily computed cache of "index minus closed" filled in by {@link #getOpenTags()}.
   */
  private Set<String> openTags = null; /* cache */

  /** True only for the "english" language setting; gates deterministic tag expansion. */
  private final boolean isEnglish; // for speed

  private static final boolean doDeterministicTagExpansion = true;

  /** If true, then the open tags are fixed and we set closed tags based on
   *  index-openTags; otherwise, we set open tags based on index-closedTags.
   */
  private boolean openFixed = false;

  /** When making a decision based on the training data as to whether a
   *  tag is closed, this is the threshold for how many tokens can be in
   *  a closed class - purposely conservative.
   *  TODO: make this an option you can set; need to pass in TaggerConfig object and then can say = config.getClosedTagThreshold());
   */
  private final int closedTagThreshold = Integer.parseInt(TaggerConfig.CLOSED_CLASS_THRESHOLD);

  /** If true, when a model is trained, all tags that had fewer tokens than
   *  closedTagThreshold will be considered closed.
   */
  private boolean learnClosedTags = false;

  /** Creates an empty tag set with no predefined closed-class tags. */
  public TTags() {
    isEnglish = false;
  }

  /*
  public TTags(TaggerConfig config) {
    String[] closedArray = config.getClosedClassTags();
    String[] openArray = config.getOpenClassTags();
    if(closedArray.length > 0) {
      closed = Generics.newHashSet(Arrays.asList(closedArray));
    } else if(openArray.length > 0) {
      openTags = Generics.newHashSet(Arrays.asList(openArray));
    } else {
      learnClosedTags = config.getLearnClosedClassTags();
      closedTagThreshold = config.getClosedTagThreshold();
    }
  }
  */

  /**
   * Creates a tag set whose closed-class tags are seeded from a built-in list
   * for the given language. The language name is matched case-insensitively;
   * the empty string selects no predefined closed tags.
   *
   * @param language One of "english", "polish", "chinese", "arabic", "german",
   *                 "french", "spanish", "medpost", "testing" or ""
   * @throws IllegalArgumentException if the language is not one of the above
   */
  TTags(String language) {
    // Only English enables deterministic tag expansion.
    isEnglish = language.equalsIgnoreCase("english");

    if (isEnglish) {
      addClosed(".", ",", "``", "''", ":", "$", "EX", "(", ")", "#",
                "MD", "CC", "DT", "LS", "PDT", "POS", "PRP", "PRP$", "RP", "TO",
                Tagger.EOS_TAG,
                "UH", "WDT", "WP", "WP$", "WRB", "-LRB-", "-RRB-");
      // "IN" is deliberately not treated as closed.
    } else if (language.equalsIgnoreCase("polish")) {
      addClosed(".", ",", "``", "''", ":", "$", "(", ")", "#", "POS",
                Tagger.EOS_TAG,
                "ppron12", "ppron3", "siebie", "qub", "conj");
    } else if (language.equalsIgnoreCase("chinese")) {
      /* chinese treebank 5 tags */
      addClosed("AS", "BA", "CC", "CS", "DEC", "DEG", "DER", "DEV", "DT", "ETC",
                "IJ", "LB", "LC", "P", "PN", "PU", "SB", "SP", "VC", "VE");
    } else if (language.equalsIgnoreCase("arabic")) {
      // kulick tag set
      // the following tags seem to be complete sets in the training
      // data (see the comments for "german" for more info)
      addClosed("PUNC", "CC", "CPRP$", Tagger.EOS_TAG);
      // maybe more should still be added ... cdm jun 2006
    } else if (language.equalsIgnoreCase("german")) {
      // The current version of the German tagger is built with the
      // negra-tiger data set.  We use the STTS tag set.  In
      // particular, we use the version with the changes described in
      // appendix A-2 of
      // http://www.uni-potsdam.de/u/germanistik/ls_dgs/tiger1-intro.pdf
      // eg the STTS tag set with PROAV instead of PAV
      // To find the closed tags, we use lists of standard closed German
      // tags, eg
      // http://www.sfs.uni-tuebingen.de/Elwis/stts/Wortlisten/WortFormen.html
      // In other words:
      //
      // APPO APPR APPRART APZR ART KOKOM KON KOUI KOUS PDAT PDS PIAT
      // PIDAT PIS PPER PPOSAT PPOSS PRELAT PRELS PRF PROAV PTKA
      // PTKANT PTKNEG PTKVZ PTKZU PWAT PWAV PWS VAFIN VAIMP VAINF
      // VAPP VMFIN VMINF VMPP
      //
      // One issue with this is that our training data does not have
      // the complete collection of many of these closed tags.  For
      // example, words with the tag APPR show up in the test or dev
      // sets without ever showing up in the training.  Tags that
      // don't have this property:
      //
      // KOKOM PPOSS PTKA PTKNEG PWAT VAINF VAPP VMINF VMPP
      addClosed("$,", "$.", "$(",
                "--", // this shouldn't be a tag of the dataset, but was a conversion bug!
                Tagger.EOS_TAG,
                "KOKOM", "PPOSS", "PTKA", "PTKNEG", "PWAT",
                "VAINF", "VAPP", "VMINF", "VMPP");
    } else if (language.equalsIgnoreCase("french")) {
      // Using the french treebank, with Spence's adaptations of
      // Candito's treebank modifications, we get that only the
      // punctuation tags are reliably closed:
      // !, ", *, ,, -, -LRB-, -RRB-, ., ..., /, :, ;, =, ?, [, ]
      addClosed("!", "\"", "*", ",", "-", "-LRB-", "-RRB-", ".", "...",
                "/", ":", ";", "=", "?", "[", "]");
    } else if (language.equalsIgnoreCase("spanish")) {
      addClosed(Tagger.EOS_TAG);
      // conjunctions
      addClosed("cc", "cs");
      // punctuation
      addClosed("faa", "fat", "fc", "fca", "fct", "fd", "fe", "fg", "fh",
                "fia", "fit", "fla", "flt", "fp", "fpa", "fpt", "fra", "frc",
                "fs", "ft", "fx", "fz");
    } else if (language.equalsIgnoreCase("medpost")) {
      addClosed(".", ",", "``", "''", ":", "$", "EX", "(", ")",
                "VM", "CC", "DD", "DB", "GE", "PND", "PNG", "TO",
                Tagger.EOS_TAG,
                "-LRB-", "-RRB-");
    } else if (language.equalsIgnoreCase("testing")) {
      addClosed(".", Tagger.EOS_TAG);
    } else if (language.equalsIgnoreCase("")) {
      // no predefined closed-class tags
    }
    /* add closed-class lists for other languages here */
    else {
      throw new IllegalArgumentException("unknown language: " + language);
    }
  }

  /** Adds the given tags to the closed set only; unlike markClosed, the index is not touched. */
  private void addClosed(String... tags) {
    closed.addAll(Arrays.asList(tags));
  }

  /** Return the Set of tags used by this tagger (available after training the tagger).
   *
   * @return The Set of tags used by this tagger
   */
  public Set<String> tagSet() {
    return new HashSet<>(index.objectsList());
  }

  /**
   * Returns the set of all open class tags.
   * <p>
   * If the open tags have not been fixed via {@link #setOpenClassTags(String[])},
   * the set is computed on the first call as every indexed tag that is not in
   * the closed set, and that result is cached: tags added or closed after the
   * first call are not reflected in later results.
   *
   * @return set of open tags
   */
  public Set<String> getOpenTags() {
    if (openTags == null) { /* cache check */
      Set<String> open = Generics.newHashSet();
      for (String tag : index) {
        if (!closed.contains(tag)) {
          open.add(tag);
        }
      }
      openTags = open;
    }
    return openTags;
  }

  /**
   * Adds a tag to the index.
   *
   * @param tag The tag to add
   * @return The value returned by the index's {@code addToIndex} for this tag
   */
  protected int add(String tag) {
    return index.addToIndex(tag);
  }

  /**
   * Looks up a tag by its id.
   *
   * @param i Id of the tag in the index
   * @return The tag stored in the index under that id
   */
  public String getTag(int i) {
    return index.get(i);
  }

  /**
   * Saves the tags to the named file; see {@link #save(DataOutputStream, Map)}.
   *
   * @param filename File to write to
   * @param tagTokens For each tag, the set of tokens seen with it
   * @throws RuntimeIOException if opening, writing or closing the file fails
   */
  protected void save(String filename, Map<String, Set<String>> tagTokens) {
    // try-with-resources so the stream is closed even when writing fails
    try (DataOutputStream out = IOUtils.getDataOutputStream(filename)) {
      save(out, tagTokens);
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Writes the number of tags, then for each tag its name (UTF) and a boolean
   * saying whether it is closed. When closed tags are being learned, any tag
   * with fewer than {@code closedTagThreshold} tokens is first marked closed;
   * a tag with no entry in {@code tagTokens} counts as having zero tokens.
   *
   * @param file Stream to write to (not closed by this method)
   * @param tagTokens For each tag, the set of tokens seen with it; only
   *                  consulted when closed tags are being learned
   * @throws RuntimeIOException if writing fails
   */
  protected void save(DataOutputStream file, Map<String, Set<String>> tagTokens) {
    try {
      file.writeInt(index.size());
      for (String item : index) {
        file.writeUTF(item);
        if (learnClosedTags) {
          // A tag may be indexed without ever being observed with a token.
          Set<String> tokens = tagTokens.get(item);
          int tokenCount = (tokens == null) ? 0 : tokens.size();
          if (tokenCount < closedTagThreshold) {
            markClosed(item);
          }
        }
        file.writeBoolean(isClosed(item));
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Reads the tags from the named file; see {@link #read(DataInputStream)}.
   *
   * @param filename File to read from
   * @throws RuntimeIOException if opening, reading or closing the file fails
   */
  protected void read(String filename) {
    // try-with-resources so the stream is closed even when reading fails
    try (DataInputStream in = IOUtils.getDataInputStream(filename)) {
      read(in);
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Replaces the index with the tags read from the stream, in the format
   * written by {@link #save(DataOutputStream, Map)}. Tags flagged as closed
   * are added to the existing closed set, which is not cleared first.
   *
   * @param file Stream to read from (not closed by this method)
   * @throws RuntimeIOException if reading fails; the index may then be
   *         only partially filled
   */
  protected void read(DataInputStream file) {
    try {
      int size = file.readInt();
      index = new HashIndex<>();
      for (int i = 0; i < size; i++) {
        String tag = file.readUTF();
        boolean inClosed = file.readBoolean();
        index.add(tag);
        if (inClosed) {
          closed.add(tag);
        }
      }
    } catch (IOException e) {
      // Fail loudly, consistent with save(): a half-read tag index is unusable.
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Tells whether a tag is closed class. With fixed open tags, every tag
   * outside the open set is closed; otherwise only tags in the closed set are.
   *
   * @param tag The tag to test
   * @return true if the tag is considered closed
   */
  protected boolean isClosed(String tag) {
    if (openFixed) {
      return !openTags.contains(tag);
    }
    return closed.contains(tag);
  }

  /** Adds the tag to the index and records it as closed class. */
  void markClosed(String tag) {
    add(tag);
    closed.add(tag);
  }

  /**
   * Sets whether {@link #save(DataOutputStream, Map)} should mark low-count tags as closed.
   *
   * @param learn true to learn closed tags when saving
   */
  public void setLearnClosedTags(boolean learn) {
    learnClosedTags = learn;
  }

  /**
   * Fixes the set of open class tags; from then on every other tag is
   * reported as closed. The given tags are also added to the index.
   *
   * @param openClassTags The open class tags
   */
  public void setOpenClassTags(String[] openClassTags) {
    openTags = Generics.newHashSet();
    openTags.addAll(Arrays.asList(openClassTags));
    for (String tag : openClassTags) {
      add(tag);
    }
    openFixed = true;
  }

  /**
   * Adds each of the given tags to the index and marks it as closed class.
   *
   * @param closedClassTags The closed class tags
   */
  public void setClosedClassTags(String[] closedClassTags) {
    for (String tag : closedClassTags) {
      markClosed(tag);
    }
  }

  /** Returns whatever the index's {@code indexOf} reports for the tag. */
  int getIndex(String tag) {
    return index.indexOf(tag);
  }

  /**
   * Returns the number of tags in the index.
   *
   * @return Number of indexed tags
   */
  public int getSize() {
    return index.size();
  }

  /**
   * Deterministically adds other possible tags for words given observed tags.
   * For instance, for English with the Penn POS tag, a word with the VB
   * tag would also be expected to have the VBP tag.
   * <p>
   * The current implementation is a bit contorted, as it works to avoid
   * object allocations wherever possible for maximum runtime speed. But
   * intuitively it's just: For English (only),
   * if the VBD tag is present but not VBN, add it, and vice versa;
   * if the VB tag is present but not VBP, add it, and vice versa.
   * <p>
   * When nothing needs adding, the input array itself is returned.
   *
   * @param tags Known possible tags for the word
   * @return A superset of tags
   */
  String[] deterministicallyExpandTags(String[] tags) {
    if (!(isEnglish && doDeterministicTagExpansion)) {
      // no tag expansion for other languages currently
      return tags;
    }

    boolean seenVBN = false;
    boolean seenVBD = false;
    boolean seenVB = false;
    boolean seenVBP = false;
    for (String tag : tags) {
      // Cheap first-character test before the string switch; empty tags are skipped.
      if (tag.isEmpty() || tag.charAt(0) != 'V') {
        continue;
      }
      switch (tag) {
        case "VBD":
          seenVBD = true;
          break;
        case "VBN":
          seenVBN = true;
          break;
        case "VB":
          seenVB = true;
          break;
        case "VBP":
          seenVBP = true;
          break;
        default:
          break;
      }
    }

    int toAdd = 0;
    if (seenVBN ^ seenVBD) { // ^ is xor: exactly one of the pair was seen
      toAdd++;
    }
    if (seenVB ^ seenVBP) {
      toAdd++;
    }
    if (toAdd == 0) {
      return tags;
    }

    int ind = tags.length;
    String[] newTags = Arrays.copyOf(tags, ind + toAdd);
    if (seenVBN && !seenVBD) {
      newTags[ind++] = "VBD";
    } else if (seenVBD && !seenVBN) {
      newTags[ind++] = "VBN";
    }
    if (seenVB && !seenVBP) {
      newTags[ind] = "VBP";
    } else if (seenVBP && !seenVB) {
      newTags[ind] = "VB";
    }
    return newTags;
  }

  /**
   * Renders the index followed by the open tags, plus the closed set when the
   * open tags are not fixed.
   */
  @Override
  public String toString() {
    StringBuilder s = new StringBuilder();
    s.append(index.toString());
    s.append(' ');
    if (openFixed) {
      s.append(" OPEN:").append(getOpenTags());
    } else {
      s.append(" open:").append(getOpenTags()).append(" CLOSED:").append(closed);
    }
    return s.toString();
  }

}