package edu.stanford.nlp.tagger.maxent;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Timing;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.File;
import java.io.Serializable;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Keeps track of a distributional similarity mapping, i.e., a map from
* word to class. Returns strings to save time, since that is how the
* results are used in the tagger.
*/
public class Distsim implements Serializable {
/** A logger for this class */
private static final Redwood.RedwoodChannels log = Redwood.channels(Distsim.class);
// Avoid loading the same lexicon twice but allow different lexicons
// TODO: when loading a distsim, should we populate this map?
private static final Map<String,Distsim> lexiconMap = Generics.newHashMap();
private final Map<String,String> lexicon;
private final String unk;
private boolean mapdigits; // = false
private boolean casedDistSim; // = false;
private static final Pattern digits = Pattern.compile("[0-9]");
/**
* The Extractor argument extraction keeps ; together, so we use
* that to delimit options. Actually, the only option supported is
* mapdigits, which tells the Distsim to try mapping [0-9] to 0 and
* requery for an unknown word with digits.
*/
public Distsim(String path) {
String[] pieces = path.split(";");
String filename = pieces[0];
for (int arg = 1; arg < pieces.length; ++arg) {
if (pieces[arg].equalsIgnoreCase("mapdigits")) {
mapdigits = true;
} else if (pieces[arg].equalsIgnoreCase("casedDistSim")) {
casedDistSim = true;
} else {
throw new IllegalArgumentException("Unknown argument " + pieces[arg]);
}
}
lexicon = Generics.newHashMap();
// todo [cdm 2016]: Note that this loads file with default file encoding rather than specifying it
for (String word : ObjectBank.getLineIterator(new File(filename))) {
String[] bits = word.split("\\s+");
String w = bits[0];
if ( ! casedDistSim) {
w = w.toLowerCase();
}
lexicon.put(w, bits[1]);
}
if (lexicon.containsKey("<unk>")) {
unk = lexicon.get("<unk>");
} else {
unk = "null";
}
}
public static Distsim initLexicon(String path) {
synchronized (lexiconMap) {
Distsim lex = lexiconMap.get(path);
if (lex == null) {
Timing timer = new Timing();
lex = new Distsim(path);
lexiconMap.put(path, lex);
timer.done(log, "Loading distsim lexicon from " + path);
}
return lex;
}
}
/**
* Returns the cluster for the given word as a string. If the word
* is not found, but the Distsim contains default numbers and the
* word contains the digits 0-9, the default number is returned if
* found. If the word is still unknown, the unknown word is
* returned ("null" if no other unknown word was specified).
*/
public String getMapping(String word) {
String distSim = lexicon.get(word.toLowerCase());
if (distSim == null && mapdigits) {
Matcher matcher = digits.matcher(word);
if (matcher.find()) {
distSim = lexicon.get(matcher.replaceAll("0"));
}
}
if (distSim == null) {
distSim = unk;
}
return distSim;
}
private static final long serialVersionUID = 2L;
}