package edu.stanford.nlp.patterns.surface;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.patterns.DataInstance;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory;
import edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.util.ArrayCoreMap;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.TypesafeMap;
/**
 * Cannot handle overlapping labeled text (that is, one token cannot belong to
 * multiple labels)! Note that there must be spaces around the tags
 * {@code <label>} and {@code </label>} for the reader to work correctly!
 *
 * @author Sonal Gupta (sonalg@stanford.edu)
 *
 */
public class AnnotatedTextReader {
public static Map<String, DataInstance> parseColumnFile(BufferedReader reader,
Set<String> categoriesAllowed,
Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels,
boolean setGoldClass, String sentIDprefix ){
CoNLLDocumentReaderAndWriter conllreader = new CoNLLDocumentReaderAndWriter();
Properties props = new Properties();
SeqClassifierFlags flags = new SeqClassifierFlags(props);
flags.entitySubclassification = "noprefix";
flags.retainEntitySubclassification = false;
conllreader.init(flags);
Iterator<List<CoreLabel>> dociter = conllreader.getIterator(reader);;
int num = -1;
Map<String, DataInstance> sents = new HashMap<>();
while(dociter.hasNext()){
List<CoreLabel> doc = dociter.next();
List<String> words = new ArrayList<>();
List<CoreLabel> sentcore = new ArrayList<>();
int tokenindex = 0;
for(CoreLabel l: doc){
if(l.word().equals(CoNLLDocumentReaderAndWriter.BOUNDARY) || l.word().equals("-DOCSTART-")){
if(words.size() > 0){
num++;
String docid = sentIDprefix + "-"+String.valueOf(num);
DataInstance sentInst = DataInstance.getNewSurfaceInstance(sentcore);
sents.put(docid, sentInst);
words = new ArrayList<>();
sentcore = new ArrayList<>();
tokenindex = 0;
}
continue;
}
tokenindex ++;
words.add(l.word());
l.set(CoreAnnotations.IndexAnnotation.class, tokenindex);
l.set(CoreAnnotations.ValueAnnotation.class, l.word());
String label = l.get(CoreAnnotations.AnswerAnnotation.class);
assert label != null : "label cannot be null";
l.set(CoreAnnotations.TextAnnotation.class, l.word());
l.set(CoreAnnotations.OriginalTextAnnotation.class, l.word());
if (setGoldClass){
l.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
}
if (setClassForTheseLabels != null
&& setClassForTheseLabels.containsKey(label))
l.set(setClassForTheseLabels.get(label), label);
sentcore.add(l);
}
if(words.size() > 0){
num++;
String docid = sentIDprefix + "-"+String.valueOf(num);;
DataInstance sentInst = DataInstance.getNewSurfaceInstance(sentcore);
sents.put(docid, sentInst);
}
}
return sents;
}
public static List<CoreMap> parseFile(
BufferedReader reader,
Set<String> categoriesAllowed,
Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels,
boolean setGoldClass, String sentIDprefix)
throws IOException {
Pattern startingLabelToken = Pattern.compile("<("
+ StringUtils.join(categoriesAllowed, "|") + ")>");
Pattern endLabelToken = Pattern.compile("</("
+ StringUtils.join(categoriesAllowed, "|") + ")>");
String backgroundSymbol = "O";
List<CoreMap> sentences = new ArrayList<>();
int lineNum = -1;
String l = null;
while ((l = reader.readLine()) != null) {
lineNum++;
String[] t = l.split("\t", 2);
String id = null;
String text = null;
if (t.length == 2) {
id = t[0];
text = t[1];
} else if (t.length == 1) {
text = t[0];
id = String.valueOf(lineNum);
}
id = sentIDprefix + id;
DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizerFactory
.newCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
dp.setTokenizerFactory(tokenizerFactory);
String label = backgroundSymbol;
int sentNum = -1;
for (List<HasWord> sentence : dp) {
sentNum++;
String sentStr = "";
List<CoreLabel> sent = new ArrayList<>();
for (HasWord tokw : sentence) {
String tok = tokw.word();
Matcher startingMatcher = startingLabelToken.matcher(tok);
Matcher endMatcher = endLabelToken.matcher(tok);
if (startingMatcher.matches()) {
//System.out.println("matched starting");
label = startingMatcher.group(1);
} else if (endMatcher.matches()) {
//System.out.println("matched end");
label = backgroundSymbol;
} else {
CoreLabel c = new CoreLabel();
List<String> toks = new ArrayList<>();
toks.add(tok);
for (String toksplit : toks) {
sentStr += " " + toksplit;
c.setWord(toksplit);
c.setLemma(toksplit);
c.setValue(toksplit);
c.set(CoreAnnotations.TextAnnotation.class, toksplit);
c.set(CoreAnnotations.OriginalTextAnnotation.class, tok);
if (setGoldClass){
c.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
}
if (setClassForTheseLabels != null
&& setClassForTheseLabels.containsKey(label))
c.set(setClassForTheseLabels.get(label), label);
sent.add(c);
}
}
}
CoreMap sentcm = new ArrayCoreMap();
sentcm.set(CoreAnnotations.TextAnnotation.class, sentStr.trim());
sentcm.set(CoreAnnotations.TokensAnnotation.class, sent);
sentcm.set(CoreAnnotations.DocIDAnnotation.class, id + "-" + sentNum);
sentences.add(sentcm);
}
}
return sentences;
}
}