package edu.stanford.nlp.process;

import java.io.File;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import edu.stanford.nlp.ling.BasicDocument;
import edu.stanford.nlp.ling.Document;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.Word;

/**
 * Transforms a Document of Words into a document all or partly of
 * TaggedWords by breaking words on a tag divider character.
 *
 * @author Teg Grenager (grenager@stanford.edu)
 * @author Christopher Manning
 * @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization)
 *
 * @param <IN> The type of the input tokens (must implement HasWord)
 * @param <L> The type of the labels
 * @param <F> The type of the features
 */
public class WordToTaggedWordProcessor<IN extends HasWord, L, F> extends AbstractListProcessor<IN, HasWord, L, F> {

  /**
   * The char that we will split on. A value of 0 disables splitting.
   */
  protected char splitChar;

  /**
   * Returns a new list where each word carrying a tag has been converted
   * to a TaggedWord. Every input element yields exactly one output
   * element, in the same order. Each element is scanned for being
   * word + splitChar + tag. If it is, it is split up and added as a
   * TaggedWord, otherwise the original object is added unchanged.
   * More precisely, elements are split on the
   * last instance of splitChar with index above 0. This will give the
   * correct split, providing tags don't include the splitChar, regardless
   * of escaping, and will not allow an empty or null word - you can think
   * of the first character as always being escaped.
   *
   * @param words The input list of HasWords
   * @return A new list, perhaps with some of the elements replaced by
   *         TaggedWords
   */
  public List<HasWord> process(List<? extends IN> words) {
    // Output has exactly one element per input element, so presize.
    List<HasWord> result = new ArrayList<>(words.size());
    for (HasWord w : words) {
      result.add(splitTag(w));
    }
    return result;
  }

  /**
   * Splits the Word w on the character splitChar.
   *
   * @param w The token to examine; may be null
   * @return A new TaggedWord if w's word contains splitChar at an index
   *         above 0; otherwise w itself (including when w or its word is
   *         null, or when splitChar is 0)
   */
  private HasWord splitTag(HasWord w) {
    if (splitChar == 0 || w == null) {
      return w;
    }
    String s = w.word();
    if (s == null) {
      // Nothing to split; pass the token through rather than throwing.
      return w;
    }
    int split = s.lastIndexOf(splitChar);
    if (split <= 0) {
      // -1: no divider present; 0 isn't allowed - no empty words!
      return w;
    }
    String word = s.substring(0, split);
    String tag = s.substring(split + 1);
    return new TaggedWord(word, tag);
  }

  /**
   * Create a <code>WordToTaggedWordProcessor</code> using the default
   * forward slash character to split on.
   */
  public WordToTaggedWordProcessor() {
    this('/');
  }

  /**
   * Flexibly set the tag splitting chars. A splitChar of 0 is
   * interpreted to mean never split off a tag.
   *
   * @param splitChar The character at which to split
   */
  public WordToTaggedWordProcessor(char splitChar) {
    this.splitChar = splitChar;
  }

  /**
   * This will print out some text, recognizing tags. It can be used to
   * test tag breaking. An argument starting with <code>http://</code> or
   * <code>https://</code> is read as a URL and first passed through a
   * StripTagsProcessor; anything else is read as a local file. <br>
   * Usage: <code>
   * java edu.stanford.nlp.process.WordToTaggedWordProcessor fileOrUrl
   * </code>
   *
   * @param args Command line argument: a file or URL
   */
  public static void main(String[] args) {
    if (args.length != 1) {
      System.out.println("usage: java edu.stanford.nlp.process.WordToTaggedWordProcessor fileOrUrl");
      System.exit(0);
    }
    String filename = args[0];
    try {
      Document<HasWord, Word, Word> d;
      if (filename.startsWith("http://") || filename.startsWith("https://")) {
        Document<HasWord, Word, Word> dpre = new BasicDocument<HasWord>().init(new URL(filename));
        DocumentProcessor<Word, Word, HasWord, Word> notags = new StripTagsProcessor<>();
        d = notags.processDocument(dpre);
      } else {
        d = new BasicDocument<HasWord>().init(new File(filename));
      }
      DocumentProcessor<Word, HasWord, HasWord, Word> proc = new WordToTaggedWordProcessor<>();
      Document<HasWord, Word, HasWord> sentd = proc.processDocument(d);
      // Print each resulting token with its position in the document.
      int i = 0;
      for (HasWord w : sentd) {
        System.out.println(i + ": " + w);
        i++;
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

}