/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.uima.examples.cpe;

/* *******************************************************************************************
 * N O T E :     The XML format (XCAS) that this Cas Consumer outputs is eventually
 *               being superseded by the more standardized and compact XMI format.  However,
 *               it is used currently as the expected form for remote services, and there is
 *               existing tooling for doing stand-alone component development and debugging
 *               that uses this format to populate an initial CAS.  So it is not
 *               deprecated yet;  it is also being kept for compatibility with older versions.
 *
 *               New code should consider using the XmiWriterCasConsumer where possible,
 *               which uses the current XMI format for XML externalizations of the CAS
 ******************************************************************************************* */

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.impl.XCASSerializer;
import org.apache.uima.collection.CasConsumer_ImplBase;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceProcessException;
import org.apache.uima.util.XMLSerializer;
import org.xml.sax.SAXException;

/**
 * A simple CAS consumer that generates XCAS (XML representation of the CAS) files in the
 * filesystem.
 * <p>
 * This CAS Consumer takes one parameter:
 * <ul>
 * <li><code>OutputDirectory</code> - path to directory into which output files will be written</li>
 * </ul>
 * <p>
 * Each output file is named after the source document recorded in the CAS's
 * {@link SourceDocumentInformation} annotation (with the offset in the source appended when it is
 * greater than zero). When no usable name can be derived, files are named <code>doc0</code>,
 * <code>doc1</code>, ... in the order in which such CASes are processed.
 */
public class XCasWriterCasConsumer extends CasConsumer_ImplBase {
  /**
   * Name of configuration parameter that must be set to the path of a directory into which the
   * output files will be written.
   */
  public static final String PARAM_OUTPUTDIR = "OutputDirectory";

  /** Directory that receives the generated XCAS files; set by {@link #initialize()}. */
  private File mOutputDir;

  /** Counter used to build default file names ("doc" + counter) when no source name is known. */
  private int mDocNum;

  /**
   * Reads the {@link #PARAM_OUTPUTDIR} parameter, resets the default-name counter and makes sure
   * the output directory exists, creating it (and any missing parents) if necessary.
   * 
   * @throws ResourceInitializationException
   *           if the parameter is not set, or if the output location is not a directory and cannot
   *           be created as one
   */
  public void initialize() throws ResourceInitializationException {
    mDocNum = 0;

    Object outputDirSetting = getConfigParameterValue(PARAM_OUTPUTDIR);
    if (outputDirSetting == null) {
      // Fail with a message that names the parameter instead of an anonymous
      // NullPointerException from the File constructor.
      throw new ResourceInitializationException(new IllegalArgumentException(
              "Required configuration parameter \"" + PARAM_OUTPUTDIR + "\" is not set"));
    }
    mOutputDir = new File((String) outputDirSetting);

    // mkdirs() reports failure only through its return value. The trailing isDirectory()
    // re-check accepts the case where the directory appeared between the two calls.
    if (!mOutputDir.isDirectory() && !mOutputDir.mkdirs() && !mOutputDir.isDirectory()) {
      throw new ResourceInitializationException(new IOException(
              "Cannot create output directory \"" + mOutputDir.getAbsolutePath() + "\""));
    }
  }

  /**
   * Processes the CasContainer which was populated by the TextAnalysisEngines. <br>
   * In this case, the CAS is converted to XML and written into the output file.
   * 
   * @param aCAS
   *          CasContainer which has been populated by the TAEs
   * 
   * @throws ResourceProcessException
   *           if there is an error in processing the Resource
   * 
   * @see org.apache.uima.collection.base_cpm.CasObjectProcessor#processCas(org.apache.uima.cas.CAS)
   */
  public void processCas(CAS aCAS) throws ResourceProcessException {
    JCas jcas;
    try {
      jcas = aCAS.getJCas();
    } catch (CASException e) {
      throw new ResourceProcessException(e);
    }

    File outFile = selectOutputFile(jcas);

    // serialize XCAS and write to output file
    try {
      writeXCas(jcas.getCas(), outFile);
    } catch (IOException e) {
      throw new ResourceProcessException(e);
    } catch (SAXException e) {
      throw new ResourceProcessException(e);
    }
  }

  /**
   * Chooses the output file for a CAS.
   * <p>
   * The name is taken from the last path segment of the URI stored in the first
   * {@link SourceDocumentInformation} annotation, with the offset in the source appended when that
   * offset is greater than zero. If there is no such annotation, the URI is malformed, or the URI
   * path yields an empty file name, the default name <code>"doc" + counter</code> is used and the
   * counter is incremented.
   * 
   * @param jcas
   *          JCas view of the CAS being written
   * @return the file, inside the output directory, to which the CAS should be serialized
   */
  private File selectOutputFile(JCas jcas) {
    // retrieve the filename of the input file from the CAS
    FSIterator it = jcas.getAnnotationIndex(SourceDocumentInformation.type).iterator();
    if (it.hasNext()) {
      SourceDocumentInformation fileLoc = (SourceDocumentInformation) it.next();
      try {
        String outFileName = new File(new URL(fileLoc.getUri()).getPath()).getName();
        // An empty name (URI with an empty path or a trailing '/') would make the output file
        // resolve to the output directory itself, so treat it like a missing name.
        if (outFileName.length() > 0) {
          if (fileLoc.getOffsetInSource() > 0) {
            outFileName += fileLoc.getOffsetInSource();
          }
          return new File(mOutputDir, outFileName);
        }
      } catch (MalformedURLException ignored) {
        // invalid URL, use default naming below
      }
    }
    return new File(mOutputDir, "doc" + mDocNum++);
  }

  /**
   * Serialize a CAS to a file in XCAS format
   * 
   * @param aCas
   *          CAS to serialize
   * @param name
   *          output file
   * 
   * @throws IOException
   *           if an I/O failure occurs
   * @throws SAXException
   *           if an error occurs generating the XML text
   */
  private void writeXCas(CAS aCas, File name) throws IOException, SAXException {
    FileOutputStream out = null;

    try {
      out = new FileOutputStream(name);
      XCASSerializer ser = new XCASSerializer(aCas.getTypeSystem());
      XMLSerializer xmlSer = new XMLSerializer(out, false);
      ser.serialize(aCas, xmlSer.getContentHandler());
    } finally {
      // close the stream even when serialization fails
      if (out != null) {
        out.close();
      }
    }
  }
}