package sensim;
import com.google.common.base.Charsets;
import dima.UIMAXMLConverterHelper;
import org.apache.commons.io.IOUtils;
import org.apache.pig.EvalFunc;
import org.apache.pig.FuncSpec;
import org.apache.pig.builtin.OutputSchema;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.uima.UIMAException;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
/**
 * Pig UDF that pairs an incoming tuple with the document text of a deserialized CAS.
 *
 * <p>The input tuple is expected to carry two fields: field 0 is a "pair" tuple that is
 * passed through unchanged, and field 1 is a character sequence holding serialized CAS
 * content (presumably XML, judging by the name of {@link UIMAXMLConverterHelper} —
 * TODO confirm). The result is a two-field tuple {@code (pair, documentText)}.
 *
 * <p>A single {@link JCas} instance is created in the constructor and re-filled on every
 * call to {@link #exec(Tuple)}.
 *
 * <p>NOTE(review): the declared output schema types {@code pair} as {@code chararray},
 * while {@code exec} casts field 0 to a {@link Tuple} — confirm which one is intended.
 *
 * Date: 4/12/14
 * Time: 11:35 PM
 *
 * @author Priska Herger
 */
@OutputSchema("sentence:tuple (pair:chararray, sentence:chararray)")
public class PairSentenceSelector extends EvalFunc<Tuple> {

    /** CAS that is overwritten by every call to {@link #exec(Tuple)}. */
    private final JCas jCas;

    /** Helper used to deserialize field 1 of the input into {@link #jCas}. */
    private final UIMAXMLConverterHelper uimaXMLConverterHelper;

    /** Factory for the two-field result tuples. */
    private final TupleFactory tupleFactory = TupleFactory.getInstance();

    /**
     * Creates the UDF together with its reusable {@link JCas} and converter helper.
     *
     * @throws UIMAException if the {@link JCas} cannot be created
     */
    public PairSentenceSelector() throws UIMAException {
        super();
        jCas = JCasFactory.createJCas();
        uimaXMLConverterHelper = new UIMAXMLConverterHelper(false);
    }

    /**
     * Deserializes field 1 of {@code input} and returns it, as document text, next to field 0.
     *
     * @param input tuple whose field 0 is the pair tuple and whose field 1 is the serialized
     *              CAS content
     * @return a two-field tuple {@code (pair, documentText)}, or {@code null} when the input
     *         is {@code null}, has fewer than two fields, has a {@code null} field 0 or 1,
     *         or when deserialization fails
     * @throws IOException if a field of the input tuple cannot be read or the result tuple
     *                     cannot be populated
     */
    @Override
    public Tuple exec(Tuple input) throws IOException {
        // Both fields are read below, so anything shorter than two fields (or with a
        // missing value) is treated like other unusable input: no result for this record.
        if (input == null || input.size() < 2 || input.get(0) == null || input.get(1) == null) {
            return null;
        }

        try {
            Tuple pair = (Tuple) input.get(0);
            CharSequence serializedCas = (CharSequence) input.get(1);
            InputStream stream = IOUtils.toInputStream(serializedCas, Charsets.UTF_8.name());

            // note that jCas is changed in deserialize(...) and contains different data upon return!
            // design decision in favor of speed at the expense of readability
            uimaXMLConverterHelper.deserialize(stream, jCas);
            String sentence = jCas.getDocumentText();

            Tuple result = tupleFactory.newTuple(2);
            result.set(0, pair);
            result.set(1, sentence);
            return result;
        } catch (UIMAException e) {
            getLogger().warn("Could not deserialize CAS; returning null for this record", e);
        } catch (InterruptedException e) {
            // Restore the interrupt status so code further up the stack can react to it.
            Thread.currentThread().interrupt();
            getLogger().warn("Interrupted while deserializing CAS; returning null for this record", e);
        } catch (SAXException e) {
            getLogger().warn("Could not parse serialized CAS; returning null for this record", e);
        }
        // Best effort: a record that cannot be deserialized yields no result instead of
        // failing the whole job.
        return null;
    }

    /**
     * Returns the argument-to-function mapping of the superclass unchanged.
     *
     * @return whatever {@link EvalFunc#getArgToFuncMapping()} returns
     * @throws FrontendException if the superclass implementation throws it
     */
    @Override
    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
        return super.getArgToFuncMapping();
    }
}