package sensim; import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordLemmatizer; import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordNamedEntityRecognizer; import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordParser; import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordPosTagger; import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter; import org.apache.pig.EvalFunc; import org.apache.pig.FuncSpec; import org.apache.pig.builtin.OutputSchema; import org.apache.pig.data.Tuple; import org.apache.pig.impl.logicalLayer.FrontendException; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.UIMAException; import org.apache.uima.resource.ResourceInitializationException; import org.xml.sax.SAXException; import java.io.IOException; import java.util.List; import dima.UIMAXMLConverterHelper; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; /** * Date: 4/10/13 * Time: 1:51 PM * * @author Priska Herger * <p/> * Description: An annotator class that takes raw text as input * and processes it through a core natural language pipeline. */ @OutputSchema("annotations:chararray") public class CoreNLPAnnotator extends EvalFunc<String> { private final JCas jCas; private final AnalysisEngine engine; private final UIMAXMLConverterHelper uimaXMLConverterHelper; private final String language; public CoreNLPAnnotator(String language) { super(); this.language = language; try { AnalysisEngineDescription aggregate = createEngineDescription( createEngineDescription(StanfordSegmenter.class), createEngineDescription(StanfordPosTagger.class, StanfordPosTagger.PARAM_LANGUAGE, language, StanfordPosTagger.PARAM_VARIANT, "wsj-0-18-left3words-distsim"), createEngineDescription(StanfordLemmatizer.class), createEngineDescription(StanfordParser.class, StanfordParser.PARAM_LANGUAGE, language, StanfordParser.PARAM_WRITE_PENN_TREE, true, StanfordParser.PARAM_WRITE_POS, false, // already done in PosTagger above StanfordParser.PARAM_PRINT_TAGSET, true, StanfordParser.PARAM_VARIANT, "pcfg") ); engine = AnalysisEngineFactory.createEngine(aggregate); jCas = engine.newJCas(); uimaXMLConverterHelper = new UIMAXMLConverterHelper(false); } catch (ResourceInitializationException e) { throw new IllegalArgumentException(e); } catch (UIMAException e) { throw new IllegalArgumentException(e); } } @Override public String exec(Tuple input) throws IOException { if (input == null || input.size() == 0 || input.get(0) == null) { return null; } try { String sentence = (String) input.get(0); jCas.reset(); jCas.setDocumentText(sentence); jCas.setDocumentLanguage(language); engine.process(jCas); return uimaXMLConverterHelper.serialize(jCas); } catch (AnalysisEngineProcessException e) { e.printStackTrace(); } catch (SAXException e) { e.printStackTrace(); } return null; } @Override public List<FuncSpec> getArgToFuncMapping() throws FrontendException { return super.getArgToFuncMapping(); } }