/*
* Licensed to Think Big Analytics, Inc. under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Think Big Analytics, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Copyright 2011 Think Big Analytics. All Rights Reserved.
*/
package tap.formats.text;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import tap.Pipe;
import tap.formats.FileFormat;
import tap.formats.Formats;
/**
 * File format for plain text data read and written through Hadoop's
 * {@link TextInputFormat} and {@link TextOutputFormat}.
 */
@SuppressWarnings("deprecation")
public class TextFormat extends FileFormat {

    /**
     * Configures the job to write its output with {@link TextOutputFormat}.
     *
     * @param conf   job configuration to update
     * @param ignore unused by this format
     */
    @Override
    public void setupOutput(JobConf conf, Class<?> ignore) {
        conf.setOutputFormat(TextOutputFormat.class);
        // NOTE(review): String.class is kept as in the original; confirm whether
        // a Writable key class such as Text is expected by downstream code.
        conf.setOutputKeyClass(String.class);
    }

    /**
     * Configures the job to read its input with {@link TextInputFormat}.
     *
     * @param conf   job configuration to update
     * @param ignore unused by this format
     */
    @Override
    public void setupInput(JobConf conf, Class<?> ignore) {
        conf.setInputFormat(TextInputFormat.class);
    }

    /**
     * @return the file extension used for this format, {@code ".txt"}
     */
    @Override
    public String fileExtension() {
        return ".txt";
    }

    /**
     * Marks the pipe as carrying string data and gives it an empty string
     * as its prototype.
     *
     * @param pipe pipe to configure
     */
    @Override
    public void setPipeFormat(Pipe pipe) {
        pipe.setFormat(Formats.STRING_FORMAT);
        pipe.setPrototype("");
    }

    /**
     * @param format input format to test
     * @return {@code true} if {@code format} is a {@link TextInputFormat}
     */
    @Override
    public boolean isCompatible(InputFormat format) {
        return format instanceof TextInputFormat;
    }

    /**
     * Decides whether a file header looks like plain text.
     *
     * <p>The header matches when it contains at least one {@code '\n'} and
     * every scanned byte is a newline, carriage return, tab, or a printable
     * ASCII character (32..126). Scanning stops at the first printable
     * character seen after two newlines, so at most two records (plus any
     * bytes directly following the second newline) are inspected. Bytes with
     * the high bit set, including a leading {@code 0xFF}, are not printable
     * ASCII and therefore cause a mismatch.
     *
     * @param header leading bytes of the file; may be {@code null} or empty
     * @return {@code true} if the header looks like text, {@code false}
     *         otherwise (including for a {@code null} or empty header)
     */
    @Override
    public boolean signature(byte[] header) {
        if (header == null || header.length == 0) {
            // Nothing to inspect, so there is no evidence of text content.
            return false;
        }

        int recordDelimiterCount = 0;
        for (byte b : header) {
            if (b == '\n') {
                recordDelimiterCount++;
            } else if ((b >= 32 && b < 127) || b == '\r' || b == '\t') {
                if (recordDelimiterCount >= 2) {
                    // Done after reaching two records worth of scanning.
                    break;
                }
            } else {
                // Bytes are signed in Java, so values 0x80..0xFF are negative
                // and land here together with other control characters.
                return false;
            }
        }
        return recordDelimiterCount > 0;
    }

    /**
     * @param o object to test
     * @return {@code true} if {@code o} is a {@link String} or a Hadoop
     *         {@link Text}
     */
    @Override
    public boolean instanceOfCheck(Object o) {
        return o instanceof String || o instanceof Text;
    }
}