/*******************************************************************************
* Copyright 2012,2013
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.dkpro.bigdata.io.hadoop;
import static org.apache.uima.fit.factory.TypeSystemDescriptionFactory.createTypeSystemDescription;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.impl.XCASDeserializer;
import org.apache.uima.util.CasCreationUtils;
/**
 * Converts the old Text-based CAS files to {@link CASWritable}s.
 *
 * <p>
 * Each input value is treated as an XCAS XML document. It is deserialized into a newly created
 * CAS, wrapped in a {@link BinCasWithTypeSystemWritable} and emitted under the unchanged input
 * key. Progress and failures are reported through counters in the {@code "hpz"} group.
 *
 * @author zorn
 */
public class FormatConverterMapper
    implements Mapper<Text, Text, Text, CASWritable>
{
    /** Counter group shared by all counters reported by this mapper. */
    private static final String COUNTER_GROUP = "hpz";

    /**
     * No job-level configuration is needed by this mapper.
     *
     * @param job
     *            the job configuration (unused)
     */
    @Override
    public void configure(JobConf job)
    {
        // nothing to configure
    }

    /**
     * This mapper holds no resources, so there is nothing to release.
     */
    @Override
    public void close()
        throws IOException
    {
        // nothing to close
    }

    /**
     * Deserializes the XCAS XML held in {@code value} into a fresh CAS and emits it as a
     * {@link CASWritable} under {@code key}.
     *
     * <p>
     * Any exception raised while converting or emitting a record is counted and written to
     * stderr instead of being rethrown, so a single bad record does not abort the task.
     *
     * @param key
     *            the record key, passed through to the output unchanged
     * @param value
     *            the XCAS XML serialization of one CAS
     * @param output
     *            collector receiving the converted record
     * @param reporter
     *            used to report the {@code "hpz"} counters
     */
    @Override
    public void map(Text key, Text value, OutputCollector<Text, CASWritable> output,
            Reporter reporter)
        throws IOException
    {
        try {
            CAS cas = CasCreationUtils.createCas(createTypeSystemDescription(), null, null);
            XCASDeserializer.deserialize(IOUtils.toInputStream(value.toString(), "UTF-8"), cas);

            CASWritable casWritable = new BinCasWithTypeSystemWritable();
            casWritable.setCAS(cas);
            output.collect(key, casWritable);
            reporter.incrCounter(COUNTER_GROUP, "processed cas", 1);

            String documentText = cas.getDocumentText();
            if (documentText == null) {
                reporter.incrCounter(COUNTER_GROUP, "document text null", 1);
            }
            else {
                reporter.incrCounter(COUNTER_GROUP, "doc size", documentText.length());
            }
        }
        catch (Exception e) {
            // Name the counter after the exception class rather than its message: messages
            // can differ for every failing record (and may be null), which would create an
            // unbounded number of distinct counters. The message itself is still available
            // in the stack trace below.
            reporter.incrCounter(COUNTER_GROUP, "exception " + e.getClass().getSimpleName(), 1);
            System.err.println("Failed to convert CAS for key " + key);
            e.printStackTrace(System.err);
        }
    }
}