package edu.stanford.nlp.international.spanish.process;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations.OriginalTextAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.CoreAnnotations.ParentAnnotation;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.AbstractTokenizer;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.LexedTokenFactory;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.international.spanish.SpanishVerbStripper;
/**
* Tokenizer for raw Spanish text. This tokenization scheme is a derivative
* of PTB tokenization, but with extra rules for Spanish contractions and
* assimilations. It is based heavily on the FrenchTokenizer.
* <p>
* The tokenizer tokenizes according to the modified AnCora corpus tokenization
* standards, so the rules are a little different from PTB.
* </p>
* <p>
* A single instance of a Spanish Tokenizer is not thread safe, as it
* uses a non-threadsafe JFlex object to do the processing. Multiple
* instances can be created safely, though. A single instance of a
* SpanishTokenizerFactory is also not thread safe, as it keeps its
* options in a local variable.
* </p>
*
* @author Ishita Prasad
*/
public class SpanishTokenizer<T extends HasWord> extends AbstractTokenizer<T> {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(SpanishTokenizer.class);

  // The underlying JFlex lexer
  private final SpanishLexer lexer;

  // Internal fields for compound / verb / contraction splitting
  private final boolean splitCompounds;
  private final boolean splitVerbs;
  private final boolean splitContractions;
  private final boolean splitAny; // true iff any of the three split options is enabled

  // FIFO of sub-tokens pending emission after a split (allocated only when splitAny)
  private List<CoreLabel> compoundBuffer;
  // Separates attached pronouns from verbs (allocated only when splitVerbs)
  private SpanishVerbStripper verbStripper;

  /** Produces the tokenization for parsing used by AnCora (fixed). */
  public static final String ANCORA_OPTIONS = "ptb3Ellipsis=true,normalizeParentheses=true,ptb3Dashes=false,splitAll=true";
  /**
   * Constructs a tokenizer over the given reader.
   *
   * @param r reader supplying the raw text to tokenize
   * @param tf factory producing the token objects this tokenizer returns
   * @param lexerProperties orthographic normalization options passed to the underlying lexer
   * @param splitCompounds whether to split dash-joined compounds into separate tokens
   * @param splitVerbs whether to split verbs with attached (enclitic) pronouns
   * @param splitContractions whether to split contractions (del, al, conmigo, ...)
   */
  public SpanishTokenizer(Reader r, LexedTokenFactory<T> tf, Properties lexerProperties, boolean splitCompounds, boolean splitVerbs, boolean splitContractions) {
    lexer = new SpanishLexer(r, tf, lexerProperties);
    this.splitCompounds = splitCompounds;
    this.splitVerbs = splitVerbs;
    this.splitContractions = splitContractions;
    this.splitAny = (splitCompounds || splitVerbs || splitContractions);
    // The buffer holds the tail sub-tokens produced when one lexer token is split.
    if (splitAny) compoundBuffer = Generics.newArrayList(4);
    if (splitVerbs) verbStripper = SpanishVerbStripper.getInstance();
  }
  /**
   * Returns the next token, drawing first from the buffer of pending
   * sub-tokens left over from an earlier split, otherwise from the lexer.
   * Tokens the lexer marked (via ParentAnnotation) as compounds,
   * verb+pronoun clusters, or contractions are split here: the head piece
   * is returned and the remaining pieces are queued in {@code compoundBuffer}.
   */
  @Override
  @SuppressWarnings("unchecked")
  protected T getNext() {
    try {
      T nextToken; // initialized in do-while
      // Depending on the orthographic normalization options,
      // some tokens can be obliterated. In this case, keep iterating
      // until we see a non-zero length token.
      do {
        nextToken = (splitAny && ! compoundBuffer.isEmpty()) ?
          (T) compoundBuffer.remove(0) :
          (T) lexer.next();
      } while (nextToken != null && nextToken.word().isEmpty());
      // Check for compounds to split (only CoreLabels can carry the marker).
      if (splitAny && nextToken instanceof CoreLabel) {
        CoreLabel cl = (CoreLabel) nextToken;
        if (cl.containsKey(ParentAnnotation.class)) {
          if(splitCompounds && cl.get(ParentAnnotation.class).equals(SpanishLexer.COMPOUND_ANNOTATION))
            nextToken = (T) processCompound(cl);
          else if (splitVerbs && cl.get(ParentAnnotation.class).equals(SpanishLexer.VB_PRON_ANNOTATION))
            nextToken = (T) processVerb(cl);
          else if (splitContractions && cl.get(ParentAnnotation.class).equals(SpanishLexer.CONTR_ANNOTATION))
            nextToken = (T) processContraction(cl);
        }
      }
      // null here presumably signals end of input per AbstractTokenizer — TODO confirm.
      return nextToken;
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }
/** Copies the CoreLabel cl with the new word part */
private static CoreLabel copyCoreLabel(CoreLabel cl, String part, int beginPosition, int endPosition) {
CoreLabel newLabel = new CoreLabel(cl);
newLabel.setWord(part);
newLabel.setValue(part);
newLabel.setBeginPosition(beginPosition);
newLabel.setEndPosition(endPosition);
newLabel.set(OriginalTextAnnotation.class, part);
return newLabel;
}
private static CoreLabel copyCoreLabel(CoreLabel cl, String part, int beginPosition) {
return copyCoreLabel(cl, part, beginPosition, beginPosition + part.length());
}
  /**
   * Handles contractions like del and al, marked by the lexer
   *
   *   del => de + l => de + el
   *   al  => a + l  => a + el
   *   con[mts]igo => con + [mts]i
   *
   * Both output tokens keep character offsets into the ORIGINAL surface
   * string, so the second token's span covers the original characters even
   * when the recovered form ("el", "mí", ...) has a different length.
   */
  private CoreLabel processContraction(CoreLabel cl) {
    cl.remove(ParentAnnotation.class);
    String word = cl.word();
    String first;
    String second;
    int secondOffset = 0, secondLength = 0;
    String lowered = word.toLowerCase();
    switch (lowered) {
      case "del":
      case "al":
        first = word.substring(0, lowered.length() - 1);
        char lastChar = word.charAt(lowered.length() - 1);
        // Preserve the casing of the source text in the recovered article.
        if (Character.isLowerCase(lastChar))
          second = "el";
        else second = "EL";
        secondOffset = 1;
        secondLength = lowered.length() - 1;
        break;
      case "conmigo":
      case "consigo":
        first = word.substring(0, 3);
        second = word.charAt(3) + "í";
        secondOffset = 3;
        secondLength = 4; // original span is "migo" / "sigo"
        break;
      case "contigo":
        first = word.substring(0, 3);
        // NOTE(review): yields "ti" without an accent, unlike the "mí"/"sí"
        // produced above (and unlike the [mts]i pattern in the doc) —
        // confirm this asymmetry is intended.
        second = word.substring(3, 5);
        secondOffset = 3;
        secondLength = 4; // original span is "tigo"
        break;
      default:
        throw new IllegalArgumentException("Invalid contraction provided to processContraction");
    }
    int secondStart = cl.beginPosition() + secondOffset;
    int secondEnd = secondStart + secondLength;
    // Queue the second piece for later emission; return the first immediately.
    compoundBuffer.add(copyCoreLabel(cl, second, secondStart, secondEnd));
    return copyCoreLabel(cl, first, cl.beginPosition(), secondStart);
  }
  /**
   * Handles verbs with attached suffixes, marked by the lexer:
   *
   *   Escribamosela => Escribamo + se + la => escribamos + se + la
   *   Sentaos => senta + os => sentad + os
   *   Damelo => da + me + lo
   *
   * The pronoun pieces are queued with offsets computed against the original
   * surface form; the stem is returned with its normalized spelling as the
   * word but the original surface stem preserved as OriginalText (the two
   * may differ in length, e.g. escribamo vs. escribamos).
   */
  private CoreLabel processVerb(CoreLabel cl) {
    cl.remove(ParentAnnotation.class);
    SpanishVerbStripper.StrippedVerb stripped = verbStripper.separatePronouns(cl.word());
    if (stripped == null) {
      // Not actually a verb+pronoun cluster; emit the token unchanged.
      return cl;
    }
    // Split the CoreLabel into separate labels, tracking changing begin + end
    // positions.
    int stemEnd = cl.beginPosition() + stripped.getOriginalStem().length();
    int lengthRemoved = 0;
    for (String pronoun : stripped.getPronouns()) {
      int beginOffset = stemEnd + lengthRemoved;
      compoundBuffer.add(copyCoreLabel(cl, pronoun, beginOffset));
      lengthRemoved += pronoun.length();
    }
    CoreLabel stem = copyCoreLabel(cl, stripped.getStem(), cl.beginPosition(), stemEnd);
    stem.setOriginalText(stripped.getOriginalStem());
    return stem;
  }
  // Used to explode a dash-joined compound into space-delimited parts.
  private static final Pattern pDash = Pattern.compile("\\-");
  private static final Pattern pSpace = Pattern.compile("\\s+");

  /**
   * Splits a compound marked by the lexer into its parts, with the dashes
   * themselves emitted as separate tokens. The first part is returned and
   * the remainder is queued in {@code compoundBuffer}. Offsets are assigned
   * by accumulating part lengths, which lines up with the original surface
   * form since only dashes (length 1) separate the parts.
   */
  private CoreLabel processCompound(CoreLabel cl) {
    cl.remove(ParentAnnotation.class);
    // e.g. "foo-bar" -> "foo - bar" -> ["foo", "-", "bar"]
    String[] parts = pSpace.split(pDash.matcher(cl.word()).replaceAll(" - "));
    int lengthAccum = 0;
    for (String part : parts) {
      // NOTE(review): duplicates copyCoreLabel(cl, part, begin, end) inline;
      // consider reusing that helper.
      CoreLabel newLabel = new CoreLabel(cl);
      newLabel.setWord(part);
      newLabel.setValue(part);
      newLabel.setBeginPosition(cl.beginPosition() + lengthAccum);
      newLabel.setEndPosition(cl.beginPosition() + lengthAccum + part.length());
      newLabel.set(OriginalTextAnnotation.class, part);
      compoundBuffer.add(newLabel);
      lengthAccum += part.length();
    }
    // Head of the buffer is the first part; emit it now.
    return compoundBuffer.remove(0);
  }
  /**
   * Recommended factory method. Returns a factory vending tokenizers over
   * the given token type, configured with the given comma-separated options.
   */
  public static <T extends HasWord> TokenizerFactory<T> factory(LexedTokenFactory<T> factory, String options) {
    return new SpanishTokenizerFactory<>(factory, options);
  }

  /** Returns a factory over the given token type using {@link #ANCORA_OPTIONS}. */
  public static <T extends HasWord> TokenizerFactory<T> factory(LexedTokenFactory<T> factory) {
    return new SpanishTokenizerFactory<>(factory, ANCORA_OPTIONS);
  }
/**
* A factory for Spanish tokenizer instances.
*
* @author Spence Green
*
* @param <T>
*/
public static class SpanishTokenizerFactory<T extends HasWord> implements TokenizerFactory<T>, Serializable {
private static final long serialVersionUID = 946818805507187330L;
protected final LexedTokenFactory<T> factory;
protected Properties lexerProperties = new Properties();
protected boolean splitCompoundOption = false;
protected boolean splitVerbOption = false;
protected boolean splitContractionOption = false;
public static TokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory() {
return new SpanishTokenizerFactory<>(new CoreLabelTokenFactory());
}
/**
* Constructs a new SpanishTokenizer that returns T objects and uses the options passed in.
*
* @param options a String of options, separated by commas
* @return A TokenizerFactory that returns the right token types
* @param factory a factory for the token type that the tokenizer will return
*/
public static <T extends HasWord> SpanishTokenizerFactory<T> newSpanishTokenizerFactory(
LexedTokenFactory<T> factory, String options) {
return new SpanishTokenizerFactory<>(factory, options);
}
// Constructors
/** Make a factory for SpanishTokenizers, default options */
private SpanishTokenizerFactory(LexedTokenFactory<T> factory) {
this.factory = factory;
}
/** Make a factory for SpanishTokenizers, options passed in */
private SpanishTokenizerFactory(LexedTokenFactory<T> factory, String options) {
this.factory = factory;
setOptions(options);
}
@Override
public Iterator<T> getIterator(Reader r) {
return getTokenizer(r);
}
@Override
public Tokenizer<T> getTokenizer(Reader r) {
return new SpanishTokenizer<>(r, factory, lexerProperties, splitCompoundOption, splitVerbOption, splitContractionOption);
}
/**
* Set underlying tokenizer options.
*
* @param options A comma-separated list of options
*/
@Override
public void setOptions(String options) {
if (options == null) return;
String[] optionList = options.split(",");
for (String option : optionList) {
String[] fields = option.split("=");
if (fields.length == 1) {
switch (fields[0]) {
case "splitAll":
splitCompoundOption = true;
splitVerbOption = true;
splitContractionOption = true;
break;
case "splitCompounds":
splitCompoundOption = true;
break;
case "splitVerbs":
splitVerbOption = true;
break;
case "splitContractions":
splitContractionOption = true;
break;
default:
lexerProperties.setProperty(option, "true");
break;
}
} else if (fields.length == 2) {
switch (fields[0]) {
case "splitAll":
splitCompoundOption = Boolean.valueOf(fields[1]);
splitVerbOption = Boolean.valueOf(fields[1]);
splitContractionOption = Boolean.valueOf(fields[1]);
break;
case "splitCompounds":
splitCompoundOption = Boolean.valueOf(fields[1]);
break;
case "splitVerbs":
splitVerbOption = Boolean.valueOf(fields[1]);
break;
case "splitContractions":
splitContractionOption = Boolean.valueOf(fields[1]);
break;
default:
lexerProperties.setProperty(fields[0], fields[1]);
break;
}
} else {
System.err.printf("%s: Bad option %s%n", this.getClass().getName(), option);
}
}
}
@Override
public Tokenizer<T> getTokenizer(Reader r, String extraOptions) {
setOptions(extraOptions);
return getTokenizer(r);
}
} // end static class SpanishTokenizerFactory
  /**
   * Returns a tokenizer with Ancora tokenization.
   * The factory vends CoreLabel tokens configured with {@link #ANCORA_OPTIONS}.
   */
  public static TokenizerFactory<CoreLabel> ancoraFactory() {
    TokenizerFactory<CoreLabel> tf = SpanishTokenizerFactory.newCoreLabelTokenizerFactory();
    tf.setOptions(ANCORA_OPTIONS);
    return tf;
  }

  /**
   * A factory that vends CoreLabel tokens with default tokenization
   * (no options set; lexer defaults apply).
   */
  public static TokenizerFactory<CoreLabel> coreLabelFactory() {
    return SpanishTokenizerFactory.newCoreLabelTokenizerFactory();
  }

  /** Alias for {@link #coreLabelFactory()}. */
  public static TokenizerFactory<CoreLabel> factory() {
    return coreLabelFactory();
  }
private static String usage() {
StringBuilder sb = new StringBuilder();
String nl = System.lineSeparator();
sb.append(String.format("Usage: java %s [OPTIONS] < file%n%n", SpanishTokenizer.class.getName()));
sb.append("Options:").append(nl);
sb.append(" -help : Print this message.").append(nl);
sb.append(" -ancora : Tokenization style of AnCora (fixed).").append(nl);
sb.append(" -lowerCase : Apply lowercasing.").append(nl);
sb.append(" -encoding type : Encoding format.").append(nl);
sb.append(" -options str : Orthographic options (see SpanishLexer.java)").append(nl);
sb.append(" -tokens : Output tokens as line-separated instead of space-separated.").append(nl);
sb.append(" -onePerLine : Output tokens one per line.").append(nl);
return sb.toString();
}
private static Map<String,Integer> argOptionDefs() {
Map<String,Integer> argOptionDefs = Generics.newHashMap();
argOptionDefs.put("help", 0);
argOptionDefs.put("ftb", 0);
argOptionDefs.put("ancora", 0);
argOptionDefs.put("lowerCase", 0);
argOptionDefs.put("encoding", 1);
argOptionDefs.put("options", 1);
argOptionDefs.put("tokens", 0);
return argOptionDefs;
}
/**
* A fast, rule-based tokenizer for Spanish based on AnCora.
* Performs punctuation splitting and light tokenization by default.
* <p>
* Currently, this tokenizer does not do line splitting. It assumes that the input
* file is delimited by the system line separator. The output will be equivalently
* delimited.
* </p>
*
* @param args
*/
public static void main(String[] args) {
final Properties options = StringUtils.argsToProperties(args, argOptionDefs());
if (options.containsKey("help")) {
log.info(usage());
return;
}
// Lexer options
final TokenizerFactory<CoreLabel> tf = SpanishTokenizer.coreLabelFactory();
String orthoOptions = options.containsKey("ancora") ? ANCORA_OPTIONS : "";
if (options.containsKey("options")) {
orthoOptions = orthoOptions.isEmpty() ? options.getProperty("options") : orthoOptions + ',' + options;
}
final boolean tokens = PropertiesUtils.getBool(options, "tokens", false);
if ( ! tokens) {
orthoOptions = orthoOptions.isEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
}
tf.setOptions(orthoOptions);
// Other options
final String encoding = options.getProperty("encoding", "UTF-8");
final boolean toLower = PropertiesUtils.getBool(options, "lowerCase", false);
final Locale es = new Locale("es");
boolean onePerLine = PropertiesUtils.getBool(options, "onePerLine", false);
// Read the file from stdin
int nLines = 0;
int nTokens = 0;
final long startTime = System.nanoTime();
try {
Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new InputStreamReader(System.in, encoding));
boolean printSpace = false;
while (tokenizer.hasNext()) {
++nTokens;
String word = tokenizer.next().word();
if (word.equals(SpanishLexer.NEWLINE_TOKEN)) {
++nLines;
System.out.println();
if ( ! onePerLine) {
printSpace = false;
}
} else {
String outputToken = toLower ? word.toLowerCase(es) : word;
if (onePerLine) {
System.out.println(outputToken);
} else {
if (printSpace) {
System.out.print(" ");
}
System.out.print(outputToken);
printSpace = true;
}
}
}
} catch (UnsupportedEncodingException e) {
throw new RuntimeIOException("Bad character encoding", e);
}
long elapsedTime = System.nanoTime() - startTime;
double linesPerSec = (double) nLines / (elapsedTime / 1e9);
System.err.printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec);
} // end main()
}