package sensim;
import com.google.common.base.Joiner;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.NN;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import dima.UIMAXMLConverterHelper;
import org.apache.pig.EvalFunc;
import org.apache.pig.builtin.OutputSchema;
import org.apache.pig.data.Tuple;
import org.apache.uima.UIMAException;
import org.apache.uima.fit.factory.JCasBuilder;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * A Pig UDF that parses one PukWaC sentence (tab-separated CoNLL-style
 * fields, token lines joined by a "\t:::::\t" marker) into a UIMA CAS
 * and returns the CAS serialized as XML.
 *
 * Customization of the Conll2006Reader.java from DKPro.
 *
 * Date: 4/11/14
 * Time: 5:51 PM
 *
 * @author Priska Herger
 *
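 * Example Pig usage (illustrative sketch; the script names and the language
 * code are assumptions, not taken from the repository):
 * <pre>
 *   DEFINE PukwacReader sensim.PukwacReader('en');
 *   sentences = LOAD 'pukwac_sentences.txt' AS (line:chararray);
 *   annotated = FOREACH sentences GENERATE PukwacReader(line) AS annotations;
 * </pre>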
*/
@OutputSchema("annotations:chararray")
public class PukwacReader extends EvalFunc<String> {
private final JCas jCas;
    private final UIMAXMLConverterHelper uimaXMLConverterHelper;
private final String language;
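    // Column indices of the six tab-separated fields in one PukWaC token line,
    // e.g. "dogs\tdog\tNNS\t2\t1\tSBJ" (illustrative example, not from the corpus).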
private static final int TOKEN = 0;
private static final int LEMMA = 1;
private static final int POS = 2;
private static final int ID = 3;
private static final int HEAD = 4;
private static final int DEP = 5;
public PukwacReader(String language) throws UIMAException {
super();
this.language = language;
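        // Create the CAS once; it is reset and reused for every input tuple.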
jCas = JCasFactory.createJCas();
uimaXMLConverterHelper = new UIMAXMLConverterHelper(false);
}
@Override
public String exec(Tuple input) throws IOException {
if (input == null || input.size() == 0 || input.get(0) == null) {
return null;
}
jCas.reset();
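        // The JCasBuilder accumulates text and tracks character offsets while
        // tokens are added to the CAS below.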
JCasBuilder doc = new JCasBuilder(jCas);
try {
String conllSentence = (String) input.get(0);
List<String[]> words = readSentence(conllSentence);
            if (words == null || words.isEmpty()) {
                return null;
            }
int sentenceBegin = doc.getPosition();
int sentenceEnd = sentenceBegin;
            // Build the document text up front by joining the tokens with single
            // spaces. Token offsets created via the JCasBuilder stay consistent
            // with this text because every token is followed by exactly one space.
            String[] sentenceArray = new String[words.size()];
            for (int i = 0; i < words.size(); i++) {
                sentenceArray[i] = words.get(i)[TOKEN];
            }
            String sentenceText = Joiner.on(" ").join(sentenceArray);
            doc.getJCas().setDocumentText(sentenceText);
doc.getJCas().setDocumentLanguage(this.language);
// process tokens, lemmas, POS tags
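            // Map 1-based CoNLL token ids to Token annotations so that the
            // dependency pass below can look up governor and dependent.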
Map<Integer, Token> tokens = new HashMap<Integer, Token>();
for (String[] word : words) {
// process token
Token token = doc.add(word[TOKEN], Token.class);
tokens.put(Integer.valueOf(word[ID]), token);
doc.add(" ");
// process lemma
Lemma lemma = new Lemma(doc.getJCas(), token.getBegin(), token.getEnd());
lemma.setValue(word[LEMMA]);
lemma.addToIndexes();
token.setLemma(lemma);
// process part-of-speech tag
POS pos = new POS(doc.getJCas(), token.getBegin(), token.getEnd());
pos.setPosValue(word[POS]);
pos.addToIndexes();
token.setPos(pos);
                // Additionally index nouns under the more specific DKPro POS type
                // NN (covers the NN and NNS tags). A MappingProvider would do this
                // generically for all tags; see also
                // https://code.google.com/p/dkpro-core-asl/wiki/ResourceProviderAPI
if(word[POS].equals("NN") || word[POS].equals("NNS")) {
NN nn = new NN(doc.getJCas(), token.getBegin(), token.getEnd());
nn.setPosValue(word[POS]);
nn.addToIndexes();
}
                // Lemma and POS were already added to the indexes above; only the
                // token itself still needs to be indexed.
                token.addToIndexes();
sentenceEnd = token.getEnd();
}
// process dependencies
            for (String[] word : words) {
                int depId = Integer.parseInt(word[ID]);
                int govId = Integer.parseInt(word[HEAD]);
                // Skip the ROOT relation (head id 0) rather than modelling it as
                // a self-loop, which would introduce cycles in later graphs.
                if (govId == 0 || word[DEP].equals("ROOT")) {
                    continue;
                }
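                // Wire governor and dependent via their CoNLL token ids.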
Dependency dep = new Dependency(doc.getJCas());
dep.setGovernor(tokens.get(govId));
dep.setDependent(tokens.get(depId));
dep.setDependencyType(word[DEP]);
                // Anchor the dependency annotation on the dependent token's span.
                dep.setBegin(dep.getDependent().getBegin());
                dep.setEnd(dep.getDependent().getEnd());
dep.addToIndexes();
}
// process sentence
Sentence sentence = new Sentence(doc.getJCas(), sentenceBegin, sentenceEnd);
sentence.addToIndexes();
            // Do not call doc.close() here: the document text was already set
            // above, and close() would set the sofa data again, causing
            // "CASRuntimeException: Data for Sofa feature setLocalSofaData()
            // has already been set."
            return uimaXMLConverterHelper.serialize(doc.getJCas());
        } catch (SAXException e) {
            // Surface serialization failures instead of swallowing them and
            // silently dropping the tuple.
            throw new IOException("Failed to serialize the CAS to XML.", e);
        }
}
    private List<String[]> readSentence(String conllSentence) {
        List<String[]> words = new ArrayList<String[]>();
        // Token lines arrive joined into one field, separated by a "\t:::::\t" marker.
        String[] conllLines = conllSentence.split("\t:::::\t");
        for (String conllLine : conllLines) {
            String[] fields = conllLine.split("\t");
            if (fields.length != 6) {
                // Malformed sentence: every word must have exactly 6 tab-separated
                // fields. Return null so the caller skips it instead of failing.
                return null;
            }
            words.add(fields);
}
return words;
}
}