/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package uima.components.collection; import java.io.File; import java.io.IOException; import java.util.ArrayList; import org.apache.uima.UimaContext; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceConfigurationException; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.FileUtils; import org.apache.uima.util.Progress; import org.apache.uima.util.ProgressImpl; import types.SourceDocumentInformation; /** * A simple collection reader that reads documents from a directory in the * filesystem. It can be configured with the following parameters: * <ul> * <li><code>InputDirectory</code> - path to directory containing files</li> * <li><code>Encoding</code> (optional) - character encoding of the input files</li> * <li><code>Language</code> (optional) - language of the input documents</li> * </ul> * * */ public class FileSystemCollectionReader extends JCasCollectionReader_ImplBase { /** * Name of configuration parameter that must be set to the path of a * directory containing input files. */ public static final String PARAM_INPUTDIR = "InputDirectory"; /** * Name of configuration parameter that contains the character encoding used * by the input files. If not specified, the default system encoding will be * used. */ public static final String PARAM_ENCODING = "Encoding"; /** * Name of optional configuration parameter that contains the language of * the documents in the input directory. If specified this information will * be added to the CAS. */ public static final String PARAM_LANGUAGE = "Language"; /** * Name of optional configuration parameter that indicates including the * subdirectories (recursively) of the current input directory. */ public static final String PARAM_SUBDIR = "BrowseSubdirectories"; private ArrayList<File> mFiles; private String mEncoding; private String mLanguage; private Boolean mRecursive; private int mCurrentIndex; /** * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize() */ @Override public void initialize(UimaContext uimaContext) { System.out.println(PARAM_INPUTDIR + ": " + getConfigParameterValue(PARAM_INPUTDIR)); // String filePath = ((String) // getConfigParameterValue(PARAM_INPUTDIR)).trim(); // InputStream iStream = Files.newDirectoryStream(new Path(filePath)); File directory = new File( ((String) getConfigParameterValue(PARAM_INPUTDIR)).trim()); // StreamUtil.stream2file(iStream); mEncoding = (String) getConfigParameterValue(PARAM_ENCODING); mLanguage = (String) getConfigParameterValue(PARAM_LANGUAGE); mRecursive = (Boolean) getConfigParameterValue(PARAM_SUBDIR); if (null == mRecursive) { // could be null if not set, it is optional mRecursive = Boolean.FALSE; } mCurrentIndex = 0; // if input directory does not exist or is not a directory, throw // exception if (!directory.exists() || !directory.isDirectory()) { try { throw new ResourceInitializationException( ResourceConfigurationException.DIRECTORY_NOT_FOUND, new Object[] { PARAM_INPUTDIR, this.getMetaData().getName(), directory.getPath() }); } catch (ResourceInitializationException e) { // TODO Auto-generated catch block e.printStackTrace(); } } // get list of files in the specified directory, and subdirectories if // the // parameter PARAM_SUBDIR is set to True mFiles = new ArrayList<File>(); addFilesFromDir(directory); } /** * This method adds files in the directory passed in as a parameter to * mFiles. If mRecursive is true, it will include all files in all * subdirectories (recursively), as well. * * @param dir */ private void addFilesFromDir(File dir) { File[] files = dir.listFiles(); for (int i = 0; i < files.length; i++) { if (!files[i].isDirectory()) { mFiles.add(files[i]); } else if (mRecursive) { addFilesFromDir(files[i]); } } } /** * @see org.apache.uima.collection.CollectionReader#hasNext() */ public boolean hasNext() { return mCurrentIndex < mFiles.size(); } /** * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS) */ public void getNext(JCas aCAS) throws IOException, CollectionException { JCas jcas = aCAS; // open input stream to file File file = (File) mFiles.get(mCurrentIndex++); String text = FileUtils.file2String(file, mEncoding); // put document in CAS jcas.setDocumentText(text); // set language if it was explicitly specified as a configuration // parameter if (mLanguage != null) { jcas.setDocumentLanguage(mLanguage); } // Also store location of source document in CAS. This information is // critical // if CAS Consumers will need to know where the original document // contents are located. // For example, the Semantic Search CAS Indexer writes this information // into the // search index that it creates, which allows applications that use the // search index to // locate the documents that satisfy their semantic queries. SourceDocumentInformation srcDocInfo = new SourceDocumentInformation( jcas); srcDocInfo.setUri(file.getAbsoluteFile().toURL().toString()); srcDocInfo.setOffsetInSource(0); srcDocInfo.setDocumentSize((int) file.length()); srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size()); srcDocInfo.addToIndexes(); } /** * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close() */ public void close() throws IOException { } /** * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress() */ public Progress[] getProgress() { return new Progress[] { new ProgressImpl(mCurrentIndex, mFiles.size(), Progress.ENTITIES) }; } /** * Gets the total number of documents that will be returned by this * collection reader. This is not part of the general collection reader * interface. * * @return the number of documents in the collection */ public int getNumberOfDocuments() { return mFiles.size(); } }