package sensim; import com.google.common.base.Charsets; import com.google.common.collect.Lists; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.NN; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import dima.UIMAXMLConverterHelper; import org.apache.commons.io.IOUtils; import org.apache.pig.EvalFunc; import org.apache.pig.FuncSpec; import org.apache.pig.builtin.OutputSchema; import org.apache.pig.data.*; import org.apache.pig.impl.logicalLayer.FrontendException; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.xml.sax.SAXException; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Iterator; import java.util.List; /** * Date: 11/19/13 * Time: 1:57 AM * * @author Priska Herger */ @OutputSchema("sentences:bag {sentence:tuple (noun1:chararray, noun2:chararray, parse:chararray)}") public class NounPairLabeler extends EvalFunc<DataBag> { private final JCas jCas; private final String language; private UIMAXMLConverterHelper uimaXMLConverterHelper; private BagFactory bagFactory = BagFactory.getInstance(); private TupleFactory tupleFactory = TupleFactory.getInstance(); public NounPairLabeler(String language) throws UIMAException { super(); this.language = language; uimaXMLConverterHelper = new UIMAXMLConverterHelper(false); jCas = JCasFactory.createJCas(); } @Override public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() == 0 || input.get(0) == null || input.get(1) == null) { return null; } DataBag dataBag = bagFactory.newDefaultBag(); try { long parseId = (Long) input.get(0); CharSequence charseq = (CharSequence) input.get(1); InputStream stream = IOUtils.toInputStream(charseq, Charsets.UTF_8.name()); // note that jCas is changed in deserialize(...) and contains different data upon return! // design decision in favor of speed at the expense of readability uimaXMLConverterHelper.deserialize(stream, jCas); Iterator<Sentence> sentences = JCasUtil.iterator(jCas, Sentence.class); while (sentences.hasNext()) { Sentence sentence = sentences.next(); //ArrayList<Token> tokens = Lists.newArrayList(JCasUtil.select(jCas, Token.class)); List<Token> tokens = JCasUtil.selectCovered(Token.class, sentence); ArrayList<Token> nouns = Lists.newArrayList(); for (Token t : tokens) { if (t.getPos() instanceof NN) { nouns.add(t); } // need this for the PukWac POS tags ( unfortunately ) else if (t.getPos().getPosValue().equals("NN") || t.getPos().getPosValue().equals("NNS")) { nouns.add(t); } } if (nouns == null || nouns.size() == 1) { return null; } // get all pairs of nouns omitting incestuous and duplicate pairs List<List<Token>> entityPairs = Lists.newArrayList(); //TODO: consider including inversed pairs here; else include them later in pipeline! for (int i = 0; i < nouns.size(); i++) { for (int j = i + 1; j < nouns.size(); j++) { entityPairs.add(Lists.newArrayList(nouns.get(i), nouns.get(j))); } } for (List<Token> pair : entityPairs) { Token n1 = pair.get(0); Token n2 = pair.get(1); Tuple tuple = tupleFactory.newTuple(3); // output: first noun \t second noun \t parseId tuple.set(0, (n1).getLemma().getValue()); tuple.set(1, (n2).getLemma().getValue()); tuple.set(2, parseId); dataBag.add(tuple); } } return dataBag; } catch (AnalysisEngineProcessException e) { e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } catch (SAXException e) { e.printStackTrace(); } catch (UIMAException e) { e.printStackTrace(); } return null; } @Override public List<FuncSpec> getArgToFuncMapping() throws FrontendException { return super.getArgToFuncMapping(); } }