/*
* chombo: Hadoop Map Reduce utility
* Author: Pranab Ghosh
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.chombo.transformer;
import java.util.regex.Matcher;
/**
* Extracts one or more fields from unstructured field
* @author pranab
*
*/
public class UnstructuredFieldExtractor {
private RawAttributeSchema rawSchema;
private boolean failOnInvalid;
private Matcher matcher;
private String[] extractedAttrs;
/**
* @param rawSchema
* @param failOnInvalid
*/
public UnstructuredFieldExtractor(RawAttributeSchema rawSchema, boolean failOnInvalid) {
this.rawSchema = rawSchema;
this.failOnInvalid = failOnInvalid;
}
/**
* @param rawAttrIndex
* @param rawAttr
* @param derivedAttr
* @param offset
* @return
*/
public int extractAttributes(int rawAttrIndex, String rawAttr, String[]derivedAttr, int offset) {
RawAttribute rawAttrMeta = rawSchema.findAttribute(rawAttrIndex);
extractAttributes(rawAttr, rawAttrMeta);
if (null != extractedAttrs) {
for (int i = 0; i < extractedAttrs.length; ++i) {
derivedAttr[offset+i] = extractedAttrs[i];
}
}
return null != extractedAttrs ? extractedAttrs.length : 0;
}
/**
* @param rawAttr
* @return
*/
private String[] extractAttributes(String rawAttr, RawAttribute rawAttrMeta) {
extractedAttrs = new String[rawAttrMeta.getNumDerivedAttributes()];
if (rawAttrMeta.isVerbatim()) {
extractedAttrs[0] = rawAttr;
} else if (null != rawAttrMeta.getRegEx()) {
matcher = rawAttrMeta.getPattern().matcher(rawAttr);
if (matcher.matches()) {
for (int i = 0; i < rawAttrMeta.getNumDerivedAttributes(); ++i) {
String extracted = matcher.group(i+1);
if(extracted != null) {
extractedAttrs[i] = extracted;
} else {
handleInvalidData();
break;
}
}
} else {
handleInvalidData();
}
} else if (null != rawAttrMeta.getSubSequenceIndexes()) {
if (rawAttrMeta.getSubSequenceIndexes().size() != rawAttrMeta.getNumDerivedAttributes()) {
throw new IllegalStateException("number subsequence indexes is not equal to number of extracted attributes");
}
int i = 0;
for (int[] indexes : rawAttrMeta.getSubSequenceIndexes()) {
if (indexes[0] < rawAttr.length() -1 && indexes[1] <= rawAttr.length()) {
String extracted = rawAttr.substring(indexes[0], indexes[1]);
extractedAttrs[i] = extracted;
} else {
handleInvalidData();
break;
}
++i;
}
} else {
throw new IllegalStateException("valid extraction strategy must be provided");
}
return extractedAttrs;
}
/**
*
*/
private void handleInvalidData() {
extractedAttrs = null;
if (failOnInvalid) {
throw new IllegalArgumentException("failed to extract from unstructured data");
}
}
}