SimpleTextSegmenter.java example

Explorer
uima_prolog-master
- uima-PrologInterface-Examples
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.uima.examples.casMultiplier;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasMultiplier_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.AbstractCas;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

/**
 * An example CasMultiplier, which breaks large text documents into smaller segments. The minimum
 * size of the segments is determined by the "SegmentSize" configuration parameter, but the break
 * between segments will always occur at the next newline character, so segments will not be exactly
 * that size.
 */
public class SimpleTextSegmenter extends JCasMultiplier_ImplBase {
  /** Name of the configuration parameter that holds the minimum segment size, in characters. */
  private static final String PARAM_SEGMENT_SIZE = "SegmentSize";

  /** Text of the document currently being segmented; null until process() has supplied one. */
  private String mDoc;

  /** Offset into mDoc at which the next segment starts. */
  private int mPos;

  /** Minimum number of characters per segment; validated to be at least 1 in initialize(). */
  private int mSegmentSize;

  /** URI copied from the input CAS's SourceDocumentInformation, or null if it had none. */
  private String mDocUri;

  /**
   * Reads and validates the "SegmentSize" configuration parameter.
   * 
   * @param aContext
   *          the context from which the configuration parameter is read
   * @throws ResourceInitializationException
   *           if the parameter is missing, is not an Integer, or is smaller than 1
   * 
   * @see org.apache.uima.analysis_component.AnalysisComponent_ImplBase#initialize(org.apache.uima.UimaContext)
   */
  public void initialize(UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);
    Object configured = aContext.getConfigParameterValue(PARAM_SEGMENT_SIZE);
    // Report a missing or mistyped parameter as an initialization failure rather than letting a
    // NullPointerException or ClassCastException escape from the unboxing below.
    if (!(configured instanceof Integer)) {
      throw new ResourceInitializationException(new IllegalArgumentException(
              "Configuration parameter \"" + PARAM_SEGMENT_SIZE
                      + "\" must be set to an Integer value, but was: " + configured));
    }
    int segmentSize = ((Integer) configured).intValue();
    // A size below 1 would make next() index the character before the start of the document
    // (or produce an inverted substring range), so reject it up front.
    if (segmentSize < 1) {
      throw new ResourceInitializationException(new IllegalArgumentException(
              "Configuration parameter \"" + PARAM_SEGMENT_SIZE
                      + "\" must be at least 1, but was: " + segmentSize));
    }
    mSegmentSize = segmentSize;
  }

  /**
   * Captures the text of the input CAS and resets the segmentation position to the start of the
   * document. The actual segments are produced by subsequent calls to next().
   * 
   * @param aJCas
   *          the input CAS whose document text is to be segmented
   * 
   * @see JCasMultiplier_ImplBase#process(JCas)
   */
  public void process(JCas aJCas) throws AnalysisEngineProcessException {
    mDoc = aJCas.getDocumentText();
    mPos = 0;
    // retrieve the URI of the input document from the CAS so that it can be added
    // to each segment
    FSIterator it = aJCas.getAnnotationIndex(SourceDocumentInformation.type).iterator();
    if (it.hasNext()) {
      SourceDocumentInformation fileLoc = (SourceDocumentInformation) it.next();
      mDocUri = fileLoc.getUri();
    } else {
      mDocUri = null;
    }
  }

  /**
   * Reports whether any text of the current document remains to be emitted as a segment.
   * 
   * @return true if a document has been supplied and the current position is before its end;
   *         false if there is no document text or all of it has been emitted
   * 
   * @see org.apache.uima.analysis_component.AnalysisComponent#hasNext()
   */
  public boolean hasNext() throws AnalysisEngineProcessException {
    // mDoc is null before the first process() call and when the input CAS has no document text
    return mDoc != null && mPos < mDoc.length();
  }

  /**
   * Produces the next segment: a new JCas whose document text starts at the current position,
   * is at least "SegmentSize" characters long (unless the document ends first), and extends to
   * just after the following newline character or to the end of the document.
   * 
   * @return the JCas holding the next segment
   * @throws AnalysisEngineProcessException
   *           if populating the new JCas fails; the JCas is released before the exception is
   *           thrown
   * 
   * @see org.apache.uima.analysis_component.AnalysisComponent#next()
   */
  public AbstractCas next() throws AnalysisEngineProcessException {
    final int docLength = mDoc.length();
    // Compute the earliest break position in long arithmetic so that a very large SegmentSize
    // cannot overflow int and yield a negative position.
    int breakAt = (int) Math.min((long) mPos + mSegmentSize, (long) docLength);
    // search for the next newline character. Note: this example segmenter implementation
    // assumes that the document contains many newlines. In the worst case, if this segmenter
    // is run on a document with no newlines, it will produce only one segment containing the
    // entire document text. A better implementation might specify a maximum segment size as
    // well as a minimum.
    while (breakAt < docLength && mDoc.charAt(breakAt - 1) != '\n') {
      breakAt++;
    }

    JCas jcas = getEmptyJCas();
    try {
      jcas.setDocumentText(mDoc.substring(mPos, breakAt));
      // if original CAS had SourceDocumentInformation, also add SourceDocumentInformation
      // to each segment
      if (mDocUri != null) {
        SourceDocumentInformation sdi = new SourceDocumentInformation(jcas);
        sdi.setUri(mDocUri);
        sdi.setOffsetInSource(mPos);
        sdi.setDocumentSize(breakAt - mPos);
        // set every feature, including the last-segment flag, before indexing the annotation
        if (breakAt == docLength) {
          sdi.setLastSegment(true);
        }
        sdi.addToIndexes();
      }

      mPos = breakAt;
      return jcas;
    } catch (Exception e) {
      // release the JCas obtained above before propagating the failure
      jcas.release();
      throw new AnalysisEngineProcessException(e);
    }
  }

}