package edu.stanford.nlp.util;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.simple.Sentence;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;

import static edu.stanford.nlp.util.TSVUtils.unescapeSQL;

/**
 * Reads sentences from a TSV, provided a list of fields to populate.
 */
public class TSVSentenceIterator implements Iterator<Sentence> {

  /** A list of possible fields in the sentence table. */
  public enum SentenceField {
    ID,
    DEPENDENCIES_BASIC,
    DEPENDENCIES_COLLAPSED,
    DEPENDENCIES_COLLAPSED_CC,
    DEPENDENCIES_ALTERNATE,
    WORDS,
    LEMMAS,
    POS_TAGS,
    NER_TAGS,
    DOC_ID,
    SENTENCE_INDEX,
    CORPUS_ID,
    DOC_CHAR_BEGIN,
    DOC_CHAR_END,
    GLOSS,
    IGNORE;  // Ignore this field.

    /** Returns true if this field carries token-level values (one value per token). */
    public boolean isToken() {
      switch (this) {
        case WORDS:
        case LEMMAS:
        case POS_TAGS:
        case NER_TAGS:
          return true;
        default:
          return false;
      }
    }
  }

  private final Iterator<List<String>> source;
  private final List<SentenceField> fields;

  public TSVSentenceIterator(Iterator<List<String>> recordSource, List<SentenceField> fields) {
    this.source = recordSource;
    this.fields = fields;
  }

  /**
   * Populates the fields of a sentence from a single TSV record.
   *
   * @param fields the fields each record contains, in column order
   * @param entries the raw column values of the record
   * @return a {@link Sentence} built from the record
   */
  public static Sentence toSentence(List<SentenceField> fields, List<String> entries) {
    return new Sentence(toCoreMap(fields, entries));
  }

  public static CoreMap toCoreMap(List<SentenceField> fields, List<String> entries) {
    CoreMap map = new ArrayCoreMap(fields.size());
    Optional<List<CoreLabel>> tokens = Optional.empty();

    // First pass - process all token level stuff.
    for (Pair<SentenceField, String> entry : Iterables.zip(fields, entries)) {
      SentenceField field = entry.first;
      String value = unescapeSQL(entry.second);
      switch (field) {
        case WORDS: {
          List<String> values = TSVUtils.parseArray(value);
          if (!tokens.isPresent()) {
            tokens = Optional.of(new ArrayList<>(values.size()));
            for (int i = 0; i < values.size(); i++) tokens.get().add(new CoreLabel());
          }
          // Provisional character offsets assuming single-space separation;
          // overwritten below if DOC_CHAR_BEGIN / DOC_CHAR_END are present.
          int beginChar = 0;
          for (int i = 0; i < values.size(); i++) {
            tokens.get().get(i).setValue(values.get(i));
            tokens.get().get(i).setWord(values.get(i));
            tokens.get().get(i).setBeginPosition(beginChar);
            tokens.get().get(i).setEndPosition(beginChar + values.get(i).length());
            beginChar += values.get(i).length() + 1;
          }
        } break;
        case LEMMAS: {
          List<String> values = TSVUtils.parseArray(value);
          if (!tokens.isPresent()) {
            tokens = Optional.of(new ArrayList<>(values.size()));
            for (int i = 0; i < values.size(); i++) tokens.get().add(new CoreLabel());
          }
          for (int i = 0; i < values.size(); i++) {
            tokens.get().get(i).setLemma(values.get(i));
          }
        } break;
        case POS_TAGS: {
          List<String> values = TSVUtils.parseArray(value);
          if (!tokens.isPresent()) {
            tokens = Optional.of(new ArrayList<>(values.size()));
            for (int i = 0; i < values.size(); i++) tokens.get().add(new CoreLabel());
          }
          for (int i = 0; i < values.size(); i++) {
            tokens.get().get(i).setTag(values.get(i));
          }
        } break;
        case NER_TAGS: {
          List<String> values = TSVUtils.parseArray(value);
          if (!tokens.isPresent()) {
            tokens = Optional.of(new ArrayList<>(values.size()));
            for (int i = 0; i < values.size(); i++) tokens.get().add(new CoreLabel());
          }
          for (int i = 0; i < values.size(); i++) {
            tokens.get().get(i).setNER(values.get(i));
          }
        } break;
        default:
          // ignore.
          break;
      }
    }

    // Document specific stuff.
    Optional<String> docId = Optional.empty();
    Optional<String> sentenceId = Optional.empty();
    Optional<Integer> sentenceIndex = Optional.empty();
    for (Pair<SentenceField, String> entry : Iterables.zip(fields, entries)) {
      SentenceField field = entry.first;
      String value = unescapeSQL(entry.second);
      switch (field) {
        case ID:
          sentenceId = Optional.of(value);
          break;
        case DOC_ID:
          docId = Optional.of(value);
          break;
        case SENTENCE_INDEX:
          sentenceIndex = Optional.of(Integer.parseInt(value));
          break;
        case GLOSS:
          value = value.replace("\\n", "\n").replace("\\t", "\t");
          map.set(CoreAnnotations.TextAnnotation.class, value);
          break;
        default:
          // ignore.
          break;
      }
    }

    // High level document stuff
    map.set(CoreAnnotations.SentenceIDAnnotation.class, sentenceId.orElse("-1"));
    map.set(CoreAnnotations.DocIDAnnotation.class, docId.orElse("???"));
    map.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex.orElse(0));

    // Doc-char
    if (tokens.isPresent()) {
      for (Pair<SentenceField, String> entry : Iterables.zip(fields, entries)) {
        SentenceField field = entry.first;
        String value = unescapeSQL(entry.second);
        switch (field) {
          case DOC_CHAR_BEGIN: {
            List<String> values = TSVUtils.parseArray(value);
            for (int i = 0; i < tokens.get().size(); i++) {
              tokens.get().get(i).setBeginPosition(Integer.parseInt(values.get(i)));
            }
          } break;
          case DOC_CHAR_END: {
            List<String> values = TSVUtils.parseArray(value);
            for (int i = 0; i < tokens.get().size(); i++) {
              tokens.get().get(i).setEndPosition(Integer.parseInt(values.get(i)));
            }
          } break;
          default:
            // ignore.
            break;
        }
      }
    }

    // Final token level stuff.
    if (tokens.isPresent()) {
      for (int i = 0; i < tokens.get().size(); i++) {
        tokens.get().get(i).set(CoreAnnotations.DocIDAnnotation.class, docId.orElse("???"));
        tokens.get().get(i).set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex.orElse(-1));
        tokens.get().get(i).set(CoreAnnotations.IndexAnnotation.class, i + 1);
        tokens.get().get(i).set(CoreAnnotations.TokenBeginAnnotation.class, i);
        tokens.get().get(i).set(CoreAnnotations.TokenEndAnnotation.class, i + 1);
      }
    }

    // Dependency trees
    if (tokens.isPresent()) {
      map.set(CoreAnnotations.TokensAnnotation.class, tokens.get());
      map.set(CoreAnnotations.TokenBeginAnnotation.class, 0);
      map.set(CoreAnnotations.TokenEndAnnotation.class, tokens.get().size());
      for (Pair<SentenceField, String> entry : Iterables.zip(fields, entries)) {
        SentenceField field = entry.first;
        String value = unescapeSQL(entry.second);
        switch (field) {
          case DEPENDENCIES_BASIC: {
            SemanticGraph graph = TSVUtils.parseJsonTree(value, tokens.get());
            map.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, graph);
//            if (!map.containsKey(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class))
//              map.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, graph);
//            if (!map.containsKey(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class))
//              map.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, graph);
          } break;
          case DEPENDENCIES_COLLAPSED: {
            SemanticGraph graph = TSVUtils.parseJsonTree(value, tokens.get());
            map.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, graph);
          } break;
          case DEPENDENCIES_COLLAPSED_CC: {
            SemanticGraph graph = TSVUtils.parseJsonTree(value, tokens.get());
//            if (!map.containsKey(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class))
//              map.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, graph);
//
            map.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, graph);
            map.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, graph);
          } break;
          case DEPENDENCIES_ALTERNATE: {
            SemanticGraph graph = TSVUtils.parseJsonTree(value, tokens.get());
            map.set(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class, graph);
          } break;
          default:
            // ignore.
            break;
        }
      }
    }

    return map;
  }

  @Override
  public boolean hasNext() {
    return source.hasNext();
  }

  @Override
  public Sentence next() {
    return toSentence(fields, source.next());
  }
}
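// ---------------------------------------------------------------------------
// Usage sketch (not part of this class). Shows one way to drive
// TSVSentenceIterator from a TSV file on disk. The file name "sentences.tsv"
// and the three-column layout (doc_id, sentence_index, gloss) are hypothetical;
// the iterator only requires an Iterator<List<String>> whose records line up
// with the given SentenceField list. Note that each cell is passed through
// unescapeSQL, so values are expected in the escaped form produced by the
// original SQL export. Imports (java.nio.file.*, java.util.*, java.util.stream.*)
// are omitted here, and Files.lines throws IOException, which the caller
// must handle.
//
//   List<TSVSentenceIterator.SentenceField> fields = Arrays.asList(
//       TSVSentenceIterator.SentenceField.DOC_ID,
//       TSVSentenceIterator.SentenceField.SENTENCE_INDEX,
//       TSVSentenceIterator.SentenceField.GLOSS);
//   try (Stream<String> lines = Files.lines(Paths.get("sentences.tsv"))) {
//     Iterator<List<String>> records =
//         lines.map(line -> Arrays.asList(line.split("\t", -1))).iterator();
//     TSVSentenceIterator it = new TSVSentenceIterator(records, fields);
//     while (it.hasNext()) {
//       System.out.println(it.next().text());
//     }
//   }
// ---------------------------------------------------------------------------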