/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.uima.examples.casMultiplier; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasMultiplier_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.AbstractCas; import org.apache.uima.cas.FSIterator; import org.apache.uima.examples.SourceDocumentInformation; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; /** * An example CasMultiplier, which breaks large text documents into smaller segments. The minimum * size of the segments as determined by the "SegmentSize" configuration parameter, but the break * between segments will always occur at the next newline character, so segments will not be exactly * that size. */ public class SimpleTextSegmenter extends JCasMultiplier_ImplBase { private String mDoc; private int mPos; private int mSegmentSize; private String mDocUri; /* * (non-Javadoc) * * @see org.apache.uima.analysis_component.AnalysisComponent_ImplBase#initialize(org.apache.uima.UimaContext) */ public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); mSegmentSize = ((Integer) aContext.getConfigParameterValue("SegmentSize")).intValue(); } /* * (non-Javadoc) * * @see JCasMultiplier_ImplBase#process(JCas) */ public void process(JCas aJCas) throws AnalysisEngineProcessException { mDoc = aJCas.getDocumentText(); mPos = 0; // retreive the filename of the input file from the CAS so that it can be added // to each segment FSIterator it = aJCas.getAnnotationIndex(SourceDocumentInformation.type).iterator(); if (it.hasNext()) { SourceDocumentInformation fileLoc = (SourceDocumentInformation) it.next(); mDocUri = fileLoc.getUri(); } else { mDocUri = null; } } /* * (non-Javadoc) * * @see org.apache.uima.analysis_component.AnalysisComponent#hasNext() */ public boolean hasNext() throws AnalysisEngineProcessException { return mPos < mDoc.length(); } /* * (non-Javadoc) * * @see org.apache.uima.analysis_component.AnalysisComponent#next() */ public AbstractCas next() throws AnalysisEngineProcessException { int breakAt = mPos + mSegmentSize; if (breakAt > mDoc.length()) breakAt = mDoc.length(); // search for the next newline character. Note: this example segmenter implementation // assumes that the document contains many newlines. In the worst case, if this segmenter // is runon a document with no newlines, it will produce only one segment containing the // entire document text. A better implementation might specify a maximum segment size as // well as a minimum. while (breakAt < mDoc.length() && mDoc.charAt(breakAt - 1) != '\n') breakAt++; JCas jcas = getEmptyJCas(); try { jcas.setDocumentText(mDoc.substring(mPos, breakAt)); // if original CAS had SourceDocumentInformation, also add SourceDocumentInformatio // to each segment if (mDocUri != null) { SourceDocumentInformation sdi = new SourceDocumentInformation(jcas); sdi.setUri(mDocUri); sdi.setOffsetInSource(mPos); sdi.setDocumentSize(breakAt - mPos); sdi.addToIndexes(); if (breakAt == mDoc.length()) { sdi.setLastSegment(true); } } mPos = breakAt; return jcas; } catch (Exception e) { jcas.release(); throw new AnalysisEngineProcessException(e); } } }