SimpleTextMerger.java example

Explorer
uima_prolog-master
- uima-PrologInterface-Examples
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.uima.examples.casMultiplier;

import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasMultiplier_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.AbstractCas;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.CasCopier;

/**
 * An example CasMultiplier, which merges text documents into larger ones. It attempts to merge all
 * of the segments that came from one original artifact. This is done by checking the "lastSegment"
 * feature of the SourceDocumentInformation FeatureStructure, which is expected to be populated by
 * the CollectionReader or CasMultiplier that produced the input CASes.
 * <p>
 * Limitations: if the lastSegment feature is never set to true by the component producing the input
 * CASes, the merger will never produce any output. Also, this implementation relies on the CASes
 * arriving in order, which could be a problem in a multithreaded framework implementation. The order
 * requirement could be relieved by recording a segment number in the SourceDocumentInformation, but
 * that would also make this example more complicated.
 */
public class SimpleTextMerger extends JCasMultiplier_ImplBase {

  /** Resource bundle name passed to every AnalysisEngineProcessException raised by this class. */
  public static final String MESSAGE_DIGEST = "org.apache.uima.examples.casMultiplier.ExampleCasMultiplierMessages";

  /** Message key used when an input CAS carries no SourceDocumentInformation. */
  public static final String MISSING_SOURCE_DOCUMENT_INFO = "missing_source_document_info";

  /** Message key used when {@link #next()} is called while no merged CAS is ready. */
  public static final String NO_NEXT_CAS = "no_next_cas";

  /** Text of all segments received so far for the document currently being merged. */
  private StringBuffer mDocBuf = new StringBuffer();

  /** CAS that accumulates the merged document; null while no merge is in progress. */
  private JCas mMergedCas;

  /** True once the last segment has been merged and the result has not yet been handed out. */
  private boolean mReadyToOutput = false;

  /** Names of the annotation types to carry over into the merged CAS; never null. */
  private String[] mAnnotationTypesToCopy;

  /**
   * Reads the "AnnotationTypesToCopy" configuration parameter.
   * <p>
   * If the parameter has no value, an empty list is used, so that only the document text is
   * merged instead of failing later with a NullPointerException in {@link #process(JCas)}.
   * 
   * @param aContext
   *          the context supplying the configuration parameter values
   * @throws ResourceInitializationException
   *           if the superclass initialization fails
   * @see org.apache.uima.analysis_component.AnalysisComponent_ImplBase#initialize(org.apache.uima.UimaContext)
   */
  public void initialize(UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);
    String[] configuredTypes = (String[]) aContext
            .getConfigParameterValue("AnnotationTypesToCopy");
    // normalize an unset parameter so process() can iterate unconditionally
    mAnnotationTypesToCopy = (configuredTypes != null) ? configuredTypes : new String[0];
  }

  /**
   * Merges one input segment into the pending merged CAS: appends its document text, copies the
   * configured annotation types with their offsets shifted by the length of the text merged so
   * far, and, if the segment's SourceDocumentInformation marks it as the last segment, finalizes
   * the merged CAS so that {@link #hasNext()} reports it.
   * 
   * @param aJCas
   *          the input segment
   * @throws AnalysisEngineProcessException
   *           with key {@link #MISSING_SOURCE_DOCUMENT_INFO} if the segment has no
   *           SourceDocumentInformation; in that case no merge state has been modified
   * @see JCasMultiplier_ImplBase#process(JCas)
   */
  public void process(JCas aJCas) throws AnalysisEngineProcessException {
    // get the SourceDocumentInformation FS, which indicates the sourceURI of the document
    // and whether the incoming CAS is the last segment. This is checked FIRST so that a
    // rejected CAS does not leave part of its text or annotations in the pending merge.
    FSIterator it = aJCas.getAnnotationIndex(SourceDocumentInformation.type).iterator();
    if (!it.hasNext()) {
      throw new AnalysisEngineProcessException(MESSAGE_DIGEST, MISSING_SOURCE_DOCUMENT_INFO,
              new Object[0]);
    }
    SourceDocumentInformation sourceDocInfo = (SourceDocumentInformation) it.next();

    // procure a new CAS if we don't have one already
    if (mMergedCas == null) {
      mMergedCas = getEmptyJCas();
    }

    // append document text; a segment without text contributes nothing (appending a null
    // String would otherwise insert the literal characters "null" into the merged document)
    int prevDocLen = mDocBuf.length();
    String docText = aJCas.getDocumentText();
    if (docText != null) {
      mDocBuf.append(docText);
    }

    // copy specified annotation types
    CasCopier copier = new CasCopier(aJCas.getCas(), mMergedCas.getCas());
    Set copiedIndexedFs = new HashSet(); // needed in case one annotation is in two indexes (could
    // happen if specified annotation types overlap)
    for (int i = 0; i < mAnnotationTypesToCopy.length; i++) {
      // NOTE(review): a type name unknown to the type system is not handled here; presumably
      // getType() yields null for it - confirm and decide how that should be reported
      Type type = mMergedCas.getTypeSystem().getType(mAnnotationTypesToCopy[i]);
      FSIndex index = aJCas.getCas().getAnnotationIndex(type);
      Iterator iter = index.iterator();
      while (iter.hasNext()) {
        FeatureStructure fs = (FeatureStructure) iter.next();
        // add() returns false if this FS was already copied via another type's index
        if (copiedIndexedFs.add(fs)) {
          Annotation copyOfFs = (Annotation) copier.copyFs(fs);
          // shift begin and end from segment-relative to merged-document offsets
          copyOfFs.setBegin(copyOfFs.getBegin() + prevDocLen);
          copyOfFs.setEnd(copyOfFs.getEnd() + prevDocLen);
          mMergedCas.addFsToIndexes(copyOfFs);
        }
      }
    }

    if (sourceDocInfo.getLastSegment()) {
      // time to produce an output CAS
      // set the document text
      mMergedCas.setDocumentText(mDocBuf.toString());

      // add source document info to destination CAS
      SourceDocumentInformation destSDI = new SourceDocumentInformation(mMergedCas);
      destSDI.setUri(sourceDocInfo.getUri());
      destSDI.setOffsetInSource(0);
      destSDI.setLastSegment(true);
      destSDI.addToIndexes();

      // start a fresh buffer for the next document
      mDocBuf = new StringBuffer();
      mReadyToOutput = true;
    }
  }

  /**
   * Reports whether a merged CAS is waiting to be retrieved with {@link #next()}.
   * 
   * @return true if the last segment of a document has been processed and its merged CAS has not
   *         been returned yet
   * @see org.apache.uima.analysis_component.AnalysisComponent#hasNext()
   */
  public boolean hasNext() throws AnalysisEngineProcessException {
    return mReadyToOutput;
  }

  /**
   * Hands out the merged CAS and resets this component so that the next call to
   * {@link #process(JCas)} starts a new merged document.
   * 
   * @return the merged CAS
   * @throws AnalysisEngineProcessException
   *           with key {@link #NO_NEXT_CAS} if no merged CAS is ready
   * @see org.apache.uima.analysis_component.AnalysisComponent#next()
   */
  public AbstractCas next() throws AnalysisEngineProcessException {
    if (!mReadyToOutput) {
      throw new AnalysisEngineProcessException(MESSAGE_DIGEST, NO_NEXT_CAS, new Object[0]);
    }
    JCas casToReturn = mMergedCas;
    // release our reference; a new CAS is requested on the next process() call
    mMergedCas = null;
    mReadyToOutput = false;
    return casToReturn;
  }

}