/*******************************************************************************
* Copyright 2010,2012
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.dkpro.bigdata.io.hadoop;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Writer;
import org.apache.hadoop.io.Text;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasConsumer_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
/**
* Write CASes to HDFS sequence files
*
* @author Richard Eckart de Castilho, Hans-Peter Zorn
*/
public class CASWritableSequenceFileWriter extends JCasConsumer_ImplBase {
    /**
     * The directory in which the sequence file is created. All CASes are
     * appended to a single file named {@code part-00000} inside this
     * directory.
     */
    public static final String PARAM_PATH = ComponentParameters.PARAM_TARGET_LOCATION;
    @ConfigurationParameter(name = PARAM_PATH, mandatory = true)
    private File path;

    /**
     * Location of a type system file.
     * <p>
     * NOTE(review): this parameter is declared but never read by this class;
     * no type system file is written. It appears to be kept so that
     * descriptors which set it remain valid - confirm before removing.
     */
    public static final String PARAM_TYPE_SYSTEM_FILE = "TypeSystemFile";
    @ConfigurationParameter(name = PARAM_TYPE_SYSTEM_FILE, mandatory = false)
    private File typeSystemFile;

    /**
     * Enable/disable Hadoop compression. When enabled,
     * {@code mapred.output.compress} is set to {@code true} and
     * {@code mapred.map.output.compression.codec} is set to the Snappy codec
     * on the configuration used to create the writer; when disabled,
     * {@code mapred.output.compress} is set to {@code false}.
     * <p>
     * NOTE(review): it should be confirmed that the sequence file writer
     * actually honours these configuration keys.
     */
    public static final String PARAM_COMPRESS = "Compress";
    @ConfigurationParameter(name = PARAM_COMPRESS, mandatory = true, defaultValue = "false")
    private boolean compress;

    /**
     * URI of the Hadoop file system, applied as {@code fs.default.name}. The
     * default value is a fixed address; callers will normally want to
     * override it.
     */
    public static final String PARAM_FS = "HadoopFs";
    @ConfigurationParameter(name = PARAM_FS, mandatory = true, defaultValue = "hdfs://10.130.21.11:8020")
    private String fileSystemName;

    /** Open sequence file writer; {@code null} before initialization and after closing. */
    private Writer writer;

    /** Number of documents for which a key has been requested so far. */
    private int documentCount = 0;

    /** Reusable value object that wraps each CAS before it is appended. */
    private CASWritable casWritable;

    /**
     * Builds the Hadoop configuration from the component parameters and opens
     * the sequence file {@code <path>/part-00000} for writing, with
     * {@link Text} keys and {@link BinCasWritable} values.
     *
     * @param context
     *            the UIMA context carrying the configuration parameters
     * @throws org.apache.uima.resource.ResourceInitializationException
     *             if the file system cannot be obtained or the writer cannot
     *             be created
     */
    @SuppressWarnings("deprecation")
    @Override
    public void initialize(org.apache.uima.UimaContext context)
            throws org.apache.uima.resource.ResourceInitializationException {
        super.initialize(context);
        // Start from an empty configuration (no default resources loaded).
        final Configuration conf = new Configuration(false);
        this.casWritable = new BinCasWritable();
        // NOTE(review): presumably redundant with the parameter injection
        // done by super.initialize(); kept to preserve existing behaviour.
        this.path = new File(
                (String) context.getConfigParameterValue(PARAM_PATH));
        conf.set("fs.default.name", this.fileSystemName);
        if (this.compress) {
            System.out.println("compressing");
            conf.set("mapred.output.compress", "true");
            conf.set("mapred.map.output.compression.codec",
                    "org.apache.hadoop.io.compress.SnappyCodec");
        } else {
            conf.set("mapred.output.compress", "false");
        }
        final String filename = this.path + "/" + "part-00000";
        try {
            // Let Hadoop's Path parse the location: unlike URI.create() it
            // does not reject characters such as spaces or backslashes, and
            // it yields the same scheme/authority for well-formed input.
            final Path target = new Path(filename);
            final FileSystem fs = FileSystem.get(target.toUri(), conf);
            this.writer = SequenceFile.createWriter(fs, conf, target,
                    Text.class, BinCasWritable.class);
        } catch (final IOException e) {
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Closes the sequence file once all CASes have been processed. Does
     * nothing if no writer is open.
     *
     * @throws AnalysisEngineProcessException
     *             if closing the writer fails; the I/O error is attached as
     *             the cause
     */
    @Override
    public void collectionProcessComplete()
            throws AnalysisEngineProcessException {
        if (this.writer == null) {
            return;
        }
        try {
            this.writer.close();
        } catch (final IOException e) {
            throw new AnalysisEngineProcessException(e);
        } finally {
            // Never attempt to close or append to the same writer again.
            this.writer = null;
        }
    }

    /**
     * Appends the given CAS to the sequence file, keyed by a string derived
     * from its document metadata.
     *
     * @param aJCas
     *            the CAS to write
     * @throws AnalysisEngineProcessException
     *             if wrapping or appending the CAS fails
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        final DocumentMetaData meta = DocumentMetaData.get(aJCas);
        final String documentKey = createKeyFromDocument(meta);
        try {
            this.casWritable.setCAS(aJCas.getCas());
            this.writer.append(new Text(documentKey), this.casWritable);
        } catch (final Exception e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Tries to figure out a meaningful key from the document metadata.
     * <ul>
     * <li>If a base URI is set, the key is the document URI with the base URI
     * prefix removed.</li>
     * <li>Otherwise the document id is used, or the document title if there
     * is no id.</li>
     * <li>If neither id nor title is available, a generated key
     * {@code doc_N} is used, where N is the zero-based count of documents
     * seen by this method.</li>
     * </ul>
     *
     * @param meta
     *            the metadata of the document being written
     * @return the key under which the document is stored; never {@code null}
     * @throws IllegalStateException
     *             if a base URI is set but is not a prefix of the document
     *             URI
     */
    private String createKeyFromDocument(final DocumentMetaData meta) {
        // Count every document so that generated keys reflect the position
        // of the document in the processing order.
        final String generatedKey = "doc_" + this.documentCount++;
        final String baseUri = meta.getDocumentBaseUri();
        final String docUri = meta.getDocumentUri();

        if (baseUri != null) {
            if ((docUri == null) || !docUri.startsWith(baseUri)) {
                throw new IllegalStateException("Base URI [" + baseUri
                        + "] is not a prefix of document URI [" + docUri + "]");
            }
            return docUri.substring(baseUri.length());
        }

        final String documentId = meta.getDocumentId();
        if (documentId != null) {
            return documentId;
        }

        // The title is only a makeshift identifier; fall back to the
        // generated key when it is missing too, so the key is never null.
        final String documentTitle = meta.getDocumentTitle();
        return (documentTitle != null) ? documentTitle : generatedKey;
    }
}