/*
* Sifarish: Recommendation Engine
* Author: Pranab Ghosh
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.sifarish.etl;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.codehaus.jackson.map.ObjectMapper;
import org.sifarish.feature.SingleTypeSchema;
import org.sifarish.util.Field;
import org.sifarish.util.Utility;
/**
* Analyzer for structured text fields, e.g. street address, phone number, etc.
* Currently supports only US formats.
* @author pranab
*
*/
public class StructuredTextAnalyzer extends Configured implements Tool{
/**
 * Configures and runs the structured text analyzer job and waits for it to finish.
 * The job uses {@link AnalyzerMapper} and emits {@code NullWritable}/{@code Text} pairs.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} is the output path;
 *             any further arguments are ignored
 * @return 0 when the job completes successfully, 1 when it fails, 2 when the
 *         required path arguments are missing
 * @throws Exception if the job cannot be configured or submitted
 */
@Override
public int run(String[] args) throws Exception {
    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 2) {
        System.err.println("Usage: " + StructuredTextAnalyzer.class.getSimpleName()
                + " <input path> <output path>");
        return 2;
    }

    Job job = new Job(getConf());
    job.setJobName("Structured Text analyzer MR");
    job.setJarByClass(StructuredTextAnalyzer.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(StructuredTextAnalyzer.AnalyzerMapper.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    // NOTE(review): presumably merges project specific settings into the job
    // configuration - confirm against Utility.setConfiguration.
    Utility.setConfiguration(job.getConfiguration());

    return job.waitForCompletion(true) ? 0 : 1;
}
/**
* @author pranab
*
*/
public static class AnalyzerMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
private Text valueHolder = new Text();
private String fieldDelim;
private String fieldDelimRegex;
private Analyzer analyzer;
private List<String> itemList = new ArrayList<String>();
private SingleTypeSchema schema;
private CountryStandardFormat countryFormat;
private StructuredTextNormalizer textNormalizer ;
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
*/
protected void setup(Context context) throws IOException, InterruptedException {
Configuration config = context.getConfiguration();
fieldDelim = config.get("field.delim", "[]");
fieldDelimRegex = config.get("field.delim.regex", "\\[\\]");
//country specific format
String country = config.get("text.country", "United States");
countryFormat = CountryStandardFormat.createCountryStandardFormat(country, textNormalizer);
//language specific analyzer
String lang = config.get("text.language", "en");
createAnalyzer(lang);
//load schema
String filePath = config.get("raw.schema.file.path");
FileSystem dfs = FileSystem.get(config);
Path src = new Path(filePath);
FSDataInputStream fs = dfs.open(src);
ObjectMapper mapper = new ObjectMapper();
schema = mapper.readValue(fs, SingleTypeSchema.class);
}
/**
* creates language specific analyzers
* @param lang
*/
private void createAnalyzer(String lang) {
if (lang.equals("en")) {
analyzer = new EnglishAnalyzer(Version.LUCENE_44);
} else {
throw new IllegalArgumentException("unsupported language:" + lang);
}
}
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Mapper#cleanup(org.apache.hadoop.mapreduce.Mapper.Context)
*/
protected void cleanup(Context context) throws IOException, InterruptedException {
}
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
*/
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] items = value.toString().split(fieldDelimRegex);
itemList.clear();
for (int i = 0;i < items.length; ++i) {
String item = items[i];
Field field = schema.getEntity().getFieldByOrdinal(i);
if (null != field && field.getDataType().equals(Field.DATA_TYPE_TEXT)) {
String format = field.getTextDataSubTypeFormat();
if (field.getDataSubType().equals(Field.TEXT_TYPE_PERSON_NAME)) {
item = countryFormat.personNameFormat(item);
} else if (field.getDataSubType().equals(Field.TEXT_TYPE_STREET_ADDRESS)) {
item = countryFormat.caseFormat(item, format);
item = countryFormat.streetAddressFormat(item);
} else if (field.getDataSubType().equals(Field.TEXT_TYPE_STREET_ADDRESS_ONE)) {
item = countryFormat.caseFormat(item, format);
item = countryFormat.streetAddressOneFormat(item);
} else if (field.getDataSubType().equals(Field.TEXT_TYPE_STREET_ADDRESS_TWO)) {
item = countryFormat.caseFormat(item, format);
item = countryFormat.streetAddressTwoFormat(item);
} else if (field.getDataSubType().equals(Field.TEXT_TYPE_CITY)) {
item = countryFormat.caseFormat(item, format);
} else if (field.getDataSubType().equals(Field.TEXT_TYPE_STATE)) {
item = countryFormat.stateFormat(item);
} else if (field.getDataSubType().equals(Field.TEXT_TYPE_ZIP)) {
item = countryFormat.caseFormat(item, format);
} else if (field.getDataSubType().equals(Field.TEXT_TYPE_COUNTRY)) {
item = countryFormat.caseFormat(item, format);
} else if (field.getDataSubType().equals(Field.TEXT_TYPE_EMAIL_ADDR)) {
item = countryFormat.emailFormat(item, format);
} else if (field.getDataSubType().equals(Field.TEXT_TYPE_PHONE_NUM)) {
item = countryFormat.phoneNumFormat(item, format);
} else {
//if text field analyze
item = tokenize(item);
}
}
itemList.add(item);
}
//build value string
valueHolder.set(org.chombo.util.Utility.join(itemList, fieldDelim));
context.write(NullWritable.get(), valueHolder);
}
/**
* @param text
* @return
* @throws IOException
*/
private String tokenize(String text) throws IOException {
TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
StringBuilder stBld = new StringBuilder();
stream.reset();
CharTermAttribute termAttribute = (CharTermAttribute)stream.getAttribute(CharTermAttribute.class);
while (stream.incrementToken()) {
String token = termAttribute.toString();
stBld.append(token).append(" ");
}
stream.end();
stream.close();
return stBld.toString();
}
}
/**
 * Command line entry point. Runs this tool through {@code ToolRunner} and exits the
 * JVM with the status the tool returns.
 *
 * @param args command line arguments handed to {@code ToolRunner}
 * @throws Exception if the tool fails with an exception
 */
public static void main(String[] args) throws Exception {
    StructuredTextAnalyzer tool = new StructuredTextAnalyzer();
    System.exit(ToolRunner.run(tool, args));
}
}