/** * Copyright (C) 2012 cogroo <cogroo@cogroo.org> * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cogroo.uima.readers.probi; import java.io.File; import java.io.IOException; import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.collection.CollectionException; import org.apache.uima.collection.CollectionReader_ImplBase; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.DocumentAnnotation; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Progress; import org.apache.uima.util.ProgressImpl; import cogroo.uima.GoldenGrammarError; import cogroo.uima.GoldenParagraph; import cogroo.uima.GoldenSentence; public class ProbiCollectionReader extends CollectionReader_ImplBase { public static final String PARAM_INPUT = "InputFile"; /** * Name of configuration parameter that contains the character encoding used * by the input files. If not specified, the default system encoding will be * used. */ public static final String PARAM_ENCODING = "Encoding"; /** * Name of optional configuration parameter that contains the language of the * documents in the input directory. If specified this information will be * added to the CAS. */ public static final String PARAM_LANGUAGE = "Language"; private String mEncoding; private String mLanguage; private ProbiParser mParser; private int mDocs; private ProbiEntry mLastEntry; private int mDocCount = 0; private String mCat; /** * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize() */ public void initialize() throws ResourceInitializationException { File input = new File( ((String) getConfigParameterValue(PARAM_INPUT)).trim()); mEncoding = (String) getConfigParameterValue(PARAM_ENCODING); mLanguage = (String) getConfigParameterValue(PARAM_LANGUAGE); try { mParser = new ProbiParser(input, mEncoding); mLastEntry = mParser.read(); } catch (Exception e) { throw new RuntimeException(e); } } /** * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS) */ public void getNext(CAS aCAS) throws IOException, CollectionException { JCas jcas; try { jcas = aCAS.getJCas(); } catch (CASException e) { throw new CollectionException(e); } int paragraphs = 0; // mLastEntry = mParser.read(); StringBuilder text = new StringBuilder(); String idPrefix = "PROBI_"; if (mCat != null) { idPrefix += mCat + "_"; } idPrefix += mDocCount + "-"; while (mLastEntry != null && sameCat(mLastEntry.getCategory(), mCat) && paragraphs < 100) { int start = text.length(); text.append(mLastEntry.getSentence()); int end = text.length(); text.append("\n\n"); GoldenParagraph p = new GoldenParagraph(jcas); p.setId(paragraphs); p.setBegin(start); p.setEnd(end); p.addToIndexes(); GoldenSentence s = new GoldenSentence(jcas); s.setId(idPrefix + paragraphs); s.setBegin(start); s.setEnd(end); if (mLastEntry.isContainsError()) { FSArray fsarr = new FSArray(jcas, 1); GoldenGrammarError ge = new GoldenGrammarError(jcas); ge.setBegin(start); ge.setEnd(end); ge.setCategory(mLastEntry.getCategory()); // ge.setError(""); // ge.setReplace(grers.get(j).getRep()); ge.addToIndexes(); fsarr.set(0, ge); s.setGoldenGrammarErrors(fsarr); } s.addToIndexes(); paragraphs++; mLastEntry = mParser.read(); } // put document in CAS jcas.setDocumentText(text.toString()); // set language if it was explicitly specified as a configuration parameter if (mLanguage != null) { ((DocumentAnnotation) jcas.getDocumentAnnotationFs()) .setLanguage(mLanguage); } if (mLastEntry != null) { mDocCount++; // mLastEntry = mParser.read(); mCat = mLastEntry.getCategory(); } // Also store location of source document in CAS. This information is // critical // if CAS Consumers will need to know where the original document contents // are located. // For example, the Semantic Search CAS Indexer writes this information into // the // search index that it creates, which allows applications that use the // search index to // locate the documents that satisfy their semantic queries. // SourceDocumentInformation srcDocInfo = new // SourceDocumentInformation(jcas); // srcDocInfo.setUri(file.getAbsoluteFile().toURL().toString()); // srcDocInfo.setOffsetInSource(0); // srcDocInfo.setDocumentSize((int) file.length()); // srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size()); // srcDocInfo.addToIndexes(); } /** * @see org.apache.uima.collection.CollectionReader#hasNext() */ public boolean hasNext() { return mLastEntry != null; } private boolean sameCat(String category, String cat) { if (cat == null && category == null) { return true; } if (cat != null) { return cat.equals(category); } return false; } /** * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close() */ public void close() throws IOException { } /** * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress() */ public Progress[] getProgress() { return new Progress[] { new ProgressImpl(mDocs, -1, Progress.ENTITIES, true) /* , new ProgressImpl(mCurrentIndex,mFiles.size(),Progress.ENTITIES) */}; } }