package edu.stanford.nlp.sequences; import edu.stanford.nlp.util.logging.Redwood; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.process.CoreLabelTokenFactory; import edu.stanford.nlp.process.CoreTokenFactory; import edu.stanford.nlp.util.*; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; import java.io.Reader; import java.util.*; import java.util.regex.Pattern; /** * Version of ColumnDocumentReaderAndWriter that doesn't read in entire file and * stores it in memory before parsing it. * * Reads in one line at a time. Assumes that sequences are broken up by empty * lines. * * Also differs from ColumnDocumentReaderAndWriter in following ways: * <ul> * <li>Splits on tabs (delimiterPattern)</li> * <li>Replaces within field whitespaces with "_" (replaceWhitespace)</li> * <li>Assumes that a line with just one column and starts * with "* xxxxx" indicates the document id (hasDocId)</li> * </ul> * * Accepts the following properties * <table> * <tr><th>Field</th><th>Type</th><th>Default</th><th>Description</th></tr> * <tr><td>{@code columns}</td><td>String</td><td>{@code}</td><td>Comma separated list of mapping between annotation (see {@link edu.stanford.nlp.ling.AnnotationLookup}) and column index (starting from 0). Example: {@code word=0,tag=1}</td></tr> * <tr><td>{@code delimiter}</td><td>String</td><td>{@code \t}</td><td>Regular expression for delimiter</td></tr> * <tr><td>{@code replaceWhitespace}</td><td>Boolean</td><td>{@code true}</td><td>Replace whitespaces with "_"</td></tr> * <tr><td>{@code tokens}</td><td>Class</td> * <td>{@link edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation}</td> * <td>Annotation field for tokens</td></tr> * <tr><td>{@code tokenFactory}</td><td>Class</td> * <td>{@link CoreLabelTokenFactory edu.stanford.nlp.process.CoreLabelTokenFactory}</td> * <td>Factory for creating tokens</td></tr> * </table> * * @author Angel Chang * @author Sonal Gupta (made the class generic) */ public class ColumnTabDocumentReaderWriter<IN extends CoreMap> implements DocumentReaderAndWriter<IN> { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(ColumnTabDocumentReaderWriter.class); private static final long serialVersionUID = 1; private String[] map; // = null; private Pattern delimiterPattern = Pattern.compile("\t"); private Pattern whitespacePattern = Pattern.compile("\\s"); private boolean replaceWhitespace = true; private String tokensAnnotationClassName; private CoreTokenFactory<IN> tokenFactory; /** * reads the tokenFactory and tokensAnnotationClassName from * {@link SeqClassifierFlags} */ @Override public void init(SeqClassifierFlags flags) { if (flags.tokensAnnotationClassName != null) { this.tokensAnnotationClassName = flags.tokensAnnotationClassName; } else { this.tokensAnnotationClassName = "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation"; } if (flags.tokenFactory != null) { try { this.tokenFactory = (CoreTokenFactory<IN>) Class.forName(flags.tokenFactory).newInstance(); } catch (Exception e) { throw new RuntimeException(e); } } else { this.tokenFactory = (CoreTokenFactory<IN>) new CoreLabelTokenFactory(); } init(flags, this.tokenFactory, this.tokensAnnotationClassName); } public void init(Properties props) { init("", props); } public void init(String name, Properties props) { String prefix = (name == null)? "":name + "."; String delimiterRegex = props.getProperty(prefix + "delimiter"); if (delimiterRegex != null) { delimiterPattern = Pattern.compile(delimiterRegex); } replaceWhitespace = PropertiesUtils.getBool(props, prefix + "replaceWhitespace", replaceWhitespace); String mapString = props.getProperty(prefix + "columns"); tokensAnnotationClassName = props.getProperty(prefix + "tokens", "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation"); String tokenFactoryClassName = props.getProperty(prefix + "tokenFactory"); if (tokenFactoryClassName != null) { try { this.tokenFactory = (CoreTokenFactory<IN>) Class.forName(tokenFactoryClassName).newInstance(); } catch (Exception e) { throw new RuntimeException(e); } } else { this.tokenFactory = (CoreTokenFactory<IN>) new CoreLabelTokenFactory(); } init(mapString, this.tokenFactory, this.tokensAnnotationClassName); } public void init(String map) { init(map, (CoreTokenFactory<IN>) new CoreLabelTokenFactory(), "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation"); } public void init(SeqClassifierFlags flags, CoreTokenFactory<IN> tokenFactory, String tokensAnnotationClassName) { this.map = StringUtils.mapStringToArray(flags.map); this.tokenFactory = tokenFactory; this.tokensAnnotationClassName = tokensAnnotationClassName; } public void init(String map, CoreTokenFactory<IN> tokenFactory, String tokensAnnotationClassName) { this.map = StringUtils.mapStringToArray(map); this.tokenFactory = tokenFactory; this.tokensAnnotationClassName = tokensAnnotationClassName; } public Iterator<List<IN>> getIterator(Reader r) { BufferedReader br; if (r instanceof BufferedReader) { br = (BufferedReader) r; } else { br = new BufferedReader(r); } return new BufferedReaderIterator<>(new ColumnDocBufferedGetNextTokens(br)); } public Iterator<Annotation> getDocIterator(Reader r) { BufferedReader br; if (r instanceof BufferedReader) { br = (BufferedReader) r; } else { br = new BufferedReader(r); } return new BufferedReaderIterator<>(new ColumnDocBufferedGetNext(br, false)); } public Iterator<Annotation> getDocIterator(Reader r, boolean includeText) { BufferedReader br; if (r instanceof BufferedReader) { br = (BufferedReader) r; } else { br = new BufferedReader(r); } return new BufferedReaderIterator<>(new ColumnDocBufferedGetNext(br, false, includeText)); } private interface GetNextFunction<E> { E getNext(); } private static class BufferedReaderIterator<E> extends AbstractIterator<E> { E nextItem; GetNextFunction<E> getNextFunc; public BufferedReaderIterator(GetNextFunction<E> getNextFunc) { this.getNextFunc = getNextFunc; this.nextItem = getNextFunc.getNext(); } public boolean hasNext() { return nextItem != null; }; public E next() { if (nextItem == null) { throw new NoSuchElementException(); } E item = nextItem; nextItem = getNextFunc.getNext(); return item; } } private class ColumnDocBufferedGetNextTokens<IN extends CoreMap> implements GetNextFunction<List<IN>> { ColumnDocBufferedGetNext docGetNext; public ColumnDocBufferedGetNextTokens(BufferedReader br) { docGetNext = new ColumnDocBufferedGetNext(br, true); } @Override public List<IN> getNext() { try { CoreMap m = docGetNext.getNext(); Class tokensAnnotationClass = Class.forName(tokensAnnotationClassName); return (List<IN>) ((m != null) ? m.get(tokensAnnotationClass) : null); } catch (ClassNotFoundException e) { e.printStackTrace(); } return null; } } private static <IN extends CoreMap> String join(Iterable<IN> l, Class textKey, String glue) { StringBuilder sb = new StringBuilder(); for (IN o : l) { if (sb.length() > 0) { sb.append(glue); } sb.append(o.get(textKey)); } return sb.toString(); } private class ColumnDocBufferedGetNext implements GetNextFunction<Annotation> { private BufferedReader br; boolean includeText = false; boolean keepBoundaries = false; boolean returnTokensOnEmptyLine = true; boolean hasDocId = true; boolean hasDocStart = false; String docId; String newDocId; int itemCnt = 0; int lineCnt = 0; public ColumnDocBufferedGetNext(BufferedReader br) { this(br, true, false); } public ColumnDocBufferedGetNext(BufferedReader br, boolean returnSegmentsAsDocs) { this(br, returnSegmentsAsDocs, false); } public ColumnDocBufferedGetNext(BufferedReader br, boolean returnSegmentsAsDocs, boolean includeText) { this.br = br; this.includeText = includeText; if (returnSegmentsAsDocs) { keepBoundaries = false; returnTokensOnEmptyLine = true; hasDocStart = false; } else { keepBoundaries = true; returnTokensOnEmptyLine = false; hasDocStart = true; } } private Annotation createDoc(String docId, List<IN> tokens, List<IntPair> sentenceBoundaries, boolean includeText) { try { String docText = includeText ? join(tokens, CoreAnnotations.TextAnnotation.class, " ") : null; Annotation doc = new Annotation(docText); doc.set(CoreAnnotations.DocIDAnnotation.class, docId); Class tokensClass = Class.forName(tokensAnnotationClassName); doc.set(tokensClass, tokens); boolean setTokenCharOffsets = includeText; if (setTokenCharOffsets) { int i = 0; for (IN token : tokens) { String tokenText = token.get(CoreAnnotations.TextAnnotation.class); token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, i); i += tokenText.length(); token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, i); /* * if (i > docText.length()) { log.info("index " + i + * " larger than docText length " + docText.length()); * log.info("Token: " + tokenText); * log.info("DocText: " + docText); } */ assert (i <= docText.length()); i++; // Skip space } } if (sentenceBoundaries != null) { List<CoreMap> sentences = new ArrayList<>(sentenceBoundaries.size()); for (IntPair p : sentenceBoundaries) { // get the sentence text from the first and last character offsets List<IN> sentenceTokens = new ArrayList<>(tokens.subList(p.getSource(), p.getTarget() + 1)); Integer begin = sentenceTokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class); int last = sentenceTokens.size() - 1; Integer end = sentenceTokens.get(last).get(CoreAnnotations.CharacterOffsetEndAnnotation.class); String sentenceText = includeText ? join(sentenceTokens, CoreAnnotations.TextAnnotation.class, " ") : null; // create a sentence annotation with text and token offsets Annotation sentence = new Annotation(sentenceText); sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin); sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end); sentence.set(tokensClass, sentenceTokens); sentence.set(CoreAnnotations.TokenBeginAnnotation.class, p.getSource()); sentence.set(CoreAnnotations.TokenEndAnnotation.class, p.getTarget() + 1); int sentenceIndex = sentences.size(); sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex); // add the sentence to the list sentences.add(sentence); } // add the sentences annotations to the document doc.set(CoreAnnotations.SentencesAnnotation.class, sentences); } return doc; } catch (ClassNotFoundException e) { e.printStackTrace(System.err); } return null; } private void markBoundary(List<IN> words, List<IntPair> boundaries) { if (words != null && !words.isEmpty()) { int curWordIndex = words.size() - 1; if (boundaries.isEmpty()) { boundaries.add(new IntPair(0, curWordIndex)); } else { int lastWordIndex = boundaries.get(boundaries.size() - 1).getTarget(); if (lastWordIndex < curWordIndex) { boundaries.add(new IntPair(lastWordIndex + 1, curWordIndex)); } } } } @Override public Annotation getNext() { if (itemCnt > 0 && itemCnt % 1000 == 0) { log.info("[" + itemCnt + "," + lineCnt + "]"); if (itemCnt % 10000 == 9000) { log.info(); } } try { String line; List<IN> words = null; List<IntPair> boundaries = null; if (keepBoundaries) { boundaries = new ArrayList<>(); } while ((line = br.readLine()) != null) { lineCnt++; line = line.trim(); if (line.length() != 0) { String[] info = delimiterPattern.split(line); if (replaceWhitespace) { for (int i = 0; i < info.length; i++) { info[i] = whitespacePattern.matcher(info[i]).replaceAll("_"); } } if (hasDocId && line.startsWith("* ") && info.length == 1) { newDocId = line.substring(2); if (words != null) { return createDoc(docId, words, boundaries, includeText); } } else if (hasDocStart && "-DOCSTART-".equals(info[0])) { newDocId = "doc" + itemCnt; if (words != null) { if (keepBoundaries) { markBoundary(words, boundaries); } return createDoc(docId, words, boundaries, includeText); } } else { if (words == null) { words = new ArrayList<>(); docId = newDocId; itemCnt++; } IN wi; if (info.length == map.length) { wi = tokenFactory.makeToken(map, info); } else { wi = tokenFactory.makeToken(map, Arrays.asList(info).subList(0, map.length).toArray(new String[map.length])); } words.add(wi); } } else { if (returnTokensOnEmptyLine && words != null) { if (keepBoundaries) { markBoundary(words, boundaries); } return createDoc(docId, words, boundaries, includeText); } else if (keepBoundaries) { markBoundary(words, boundaries); } } } if (words == null) { log.info("[" + itemCnt + "," + lineCnt + "]"); } if (keepBoundaries) { markBoundary(words, boundaries); } return (words == null) ? null : createDoc(docId, words, boundaries, includeText); } catch (IOException ex) { log.info("IOException: " + ex); throw new RuntimeException(ex); } } } // end class ColumnDocParser @Override public void printAnswers(List<IN> doc, PrintWriter out) { for (IN wi : doc) { String answer = wi.get(CoreAnnotations.AnswerAnnotation.class); String goldAnswer = wi.get(CoreAnnotations.GoldAnswerAnnotation.class); String tokenStr = StringUtils.getNotNullString(wi.get(CoreAnnotations.TextAnnotation.class)); out.println(tokenStr + "\t" + goldAnswer + "\t" + answer); } out.println(); } }