/*
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.io.fangorn;

import static org.apache.uima.fit.util.JCasUtil.select;

import java.io.File;
import java.io.IOException;

import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasConsumer_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;

import au.edu.unimelb.csse.ParseException;
import au.edu.unimelb.csse.analyser.Node;
import au.edu.unimelb.csse.analyser.NodeTreebankAnalyser;
import au.edu.unimelb.csse.analyser.OverflowException;
import au.edu.unimelb.csse.analyser.String2NodesParser;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree;

/**
 * Fangorn index writer. Converts each {@link PennTree} annotation into a Fangorn node tree and
 * stores it as one Lucene document, together with the collection ID, document ID, and the
 * sentence offsets.
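 * <p>
 * A minimal pipeline sketch, assuming a reader that produces {@link PennTree} annotations; the
 * reader class and the locations below are illustrative assumptions, not part of this component:
 * </p>
 *
 * <pre>{@code
 * SimplePipeline.runPipeline(
 *         CollectionReaderFactory.createReaderDescription(
 *                 PennTreebankCombinedReader.class, // assumed PennTree-producing reader
 *                 PennTreebankCombinedReader.PARAM_SOURCE_LOCATION, "corpus/*.mrg"),
 *         AnalysisEngineFactory.createEngineDescription(
 *                 FangornWriter.class,
 *                 FangornWriter.PARAM_TARGET_LOCATION, "target/fangorn-index"));
 * }</pre>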
 */
@MimeTypeCapability({ MimeTypes.APPLICATION_X_FANGORN })
@TypeCapability(
        inputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
                "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree" })
public class FangornWriter
    extends JCasConsumer_ImplBase
{
    public static final String FIELD_FANGORN = "sent";
    public static final String FIELD_COLLECTION_ID = "collectionId";
    public static final String FIELD_DOCUMENT_ID = "documentId";
    public static final String FIELD_BEGIN = "begin";
    public static final String FIELD_END = "end";

    /**
     * Location to which the output is written.
     */
    public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION;
    @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true)
    private File outputFolder;

    private IndexWriter writer;
    private NodeTreebankAnalyser analyser;
    private final String2NodesParser parser = new String2NodesParser();

    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        analyser = new NodeTreebankAnalyser(false);
        try {
            writer = new IndexWriter(outputFolder, analyser, true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
        }
        catch (IOException e) {
            throw new ResourceInitializationException(e);
        }
    }

    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        DocumentMetaData meta = DocumentMetaData.get(aJCas);
        for (PennTree s : select(aJCas, PennTree.class)) {
            // Convert the bracketed tree into Fangorn's node representation. Trees that
            // Fangorn cannot parse are logged and skipped.
            Node root;
            try {
                root = parser.parse(s.getPennTree());
            }
            catch (ParseException e) {
                getContext().getLogger().log(Level.SEVERE,
                        ExceptionUtils.getRootCauseMessage(e));
                continue;
            }

            String asJson = root.asJSONString();

            // One Lucene document per sentence: metadata fields are stored verbatim, the
            // JSON-serialized tree is analyzed with position information for tree queries.
            Document d = new Document();
            d.add(new Field(FIELD_DOCUMENT_ID, meta.getDocumentId(), Field.Store.YES,
                    Field.Index.NOT_ANALYZED, Field.TermVector.NO));
            d.add(new Field(FIELD_COLLECTION_ID, meta.getCollectionId(), Field.Store.YES,
                    Field.Index.NOT_ANALYZED, Field.TermVector.NO));
            d.add(new Field(FIELD_BEGIN, Integer.toString(s.getBegin()), Field.Store.YES,
                    Field.Index.NOT_ANALYZED, Field.TermVector.NO));
            d.add(new Field(FIELD_END, Integer.toString(s.getEnd()), Field.Store.YES,
                    Field.Index.NOT_ANALYZED, Field.TermVector.NO));
            d.add(new Field(FIELD_FANGORN, asJson, Field.Store.COMPRESS,
                    Field.Index.ANALYZED_NO_NORMS, Field.TermVector.WITH_POSITIONS));

            try {
                writer.addDocument(d);
            }
            catch (OverflowException e) {
                // Trees exceeding Fangorn's internal limits are logged and skipped.
                getContext().getLogger().log(Level.SEVERE,
                        ExceptionUtils.getRootCauseMessage(e));
                continue;
            }
            catch (Exception e) {
                throw new AnalysisEngineProcessException(e);
            }
        }
    }

    @Override
    public void collectionProcessComplete()
        throws AnalysisEngineProcessException
    {
        if (writer != null) {
            try {
                writer.close();
            }
            catch (IOException e) {
                // Ignore exception on close
            }
        }
    }
}
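/*
 * Conversion sketch (illustrative, not part of the component): the transformation the writer
 * applies per sentence, using only the Fangorn classes imported above. The bracketed tree is an
 * assumed example.
 *
 *     String2NodesParser parser = new String2NodesParser();
 *     Node root = parser.parse("(S (NP (DT The) (NN cat)) (VP (VBZ sleeps)))");
 *     String asJson = root.asJSONString(); // JSON form that is analyzed and stored in "sent"
 */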