/*
 * Sifarish: Recommendation Engine
 * Author: Pranab Ghosh
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.sifarish.common;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.apache.lucene.analysis.morfologik.MorfologikAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.codehaus.jackson.map.ObjectMapper;
import org.sifarish.feature.SingleTypeSchema;
import org.sifarish.util.Field;
import org.sifarish.util.FieldExtractor;
import org.sifarish.util.Utility;

/**
 * Map reduce job that tokenizes and stems configured text fields of each record
 * with a language specific Lucene analyzer.
 * @author pranab
 */
public class TextAnalyzer extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        String jobName = "Text analyzer MR";
        job.setJobName(jobName);

        job.setJarByClass(TextAnalyzer.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(TextAnalyzer.AnalyzerMapper.class);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        Utility.setConfiguration(job.getConfiguration());

        int status = job.waitForCompletion(true) ? 0 : 1;
        return status;
    }
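
    /*
     * Configuration properties read by AnalyzerMapper.setup() below:
     *   field.delim               output field delimiter (default "[]")
     *   field.delim.regex         regex used to split input records (default "\\[\\]")
     *   tea.consolidate.field     if true, all analyzed text fields are appended as one trailing field
     *   tea.text.field.ordinals   comma separated ordinals of the text fields to analyze
     *   tea.text.language         analyzer language code: en, de, es, fr, it, br, ru or pl
     *   tea.raw.schema.file.path  HDFS path of the JSON schema mapped to SingleTypeSchema
     */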

    /**
     * Mapper that tokenizes and stems text fields and extracts structured fields from them.
     * @author pranab
     */
    public static class AnalyzerMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
        private Text valueHolder = new Text();
        private String fieldDelim;
        private String fieldDelimRegex;
        private boolean consolidateFields;
        private Set<Integer> textFieldOrdinals = new HashSet<Integer>();
        private Analyzer analyzer;
        private List<String> itemList = new ArrayList<String>();
        private SingleTypeSchema schema;
        private Map<Integer, String> extractedFields = new HashMap<Integer, String>();
        private Set<Integer> retainedFieldOrdinals = new HashSet<Integer>();

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
         */
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration config = context.getConfiguration();
            fieldDelim = config.get("field.delim", "[]");
            fieldDelimRegex = config.get("field.delim.regex", "\\[\\]");
            consolidateFields = config.getBoolean("tea.consolidate.field", false);

            //ordinals of text fields to be analyzed; guard against the property being
            //unset, which would otherwise make Integer.parseInt fail on an empty string
            String textFields = config.get("tea.text.field.ordinals", "");
            if (!textFields.isEmpty()) {
                String[] items = textFields.split(",");
                for (int i = 0; i < items.length; ++i) {
                    textFieldOrdinals.add(Integer.parseInt(items[i]));
                }
            }

            //language specific analyzer
            String lang = config.get("tea.text.language", "en");
            createAnalyzer(lang);

            //load schema
            String filePath = config.get("tea.raw.schema.file.path");
            FileSystem dfs = FileSystem.get(config);
            Path src = new Path(filePath);
            FSDataInputStream fs = dfs.open(src);
            ObjectMapper mapper = new ObjectMapper();
            schema = mapper.readValue(fs, SingleTypeSchema.class);

            //ordinals of fields retained in the output
            for (Field field : schema.getEntity().getFields()) {
                retainedFieldOrdinals.add(field.getOrdinal());
            }
        }

        /**
         * Creates a language specific Lucene analyzer.
         * @param lang two letter language code
         */
        private void createAnalyzer(String lang) {
            if (lang.equals("en")) {
                analyzer = new EnglishAnalyzer(Version.LUCENE_44);
            } else if (lang.equals("de")) {
                analyzer = new GermanAnalyzer(Version.LUCENE_44);
            } else if (lang.equals("es")) {
                analyzer = new SpanishAnalyzer(Version.LUCENE_44);
            } else if (lang.equals("fr")) {
                analyzer = new FrenchAnalyzer(Version.LUCENE_44);
            } else if (lang.equals("it")) {
                analyzer = new ItalianAnalyzer(Version.LUCENE_44);
            } else if (lang.equals("br")) {
                analyzer = new BrazilianAnalyzer(Version.LUCENE_44);
            } else if (lang.equals("ru")) {
                analyzer = new RussianAnalyzer(Version.LUCENE_44);
            } else if (lang.equals("pl")) {
                analyzer = new MorfologikAnalyzer(Version.LUCENE_44);
            } else {
                throw new IllegalArgumentException("unsupported language: " + lang);
            }
        }

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#cleanup(org.apache.hadoop.mapreduce.Mapper.Context)
         */
        protected void cleanup(Context context) throws IOException, InterruptedException {
        }
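
        /*
         * Illustrative record flow through map() below, assuming the default "[]"
         * delimiter, English analysis and tea.text.field.ordinals=2 (sample data and
         * stems are hypothetical; actual tokens depend on the Lucene EnglishAnalyzer):
         *   input : 1234[]headphone[]Great sound quality and long battery life
         *   output: 1234[]headphone[]great sound qualiti long batteri life
         */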

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
            String[] items = value.toString().split(fieldDelimRegex);
            StringBuilder stBld = new StringBuilder();
            StringBuilder consFields = new StringBuilder();
            itemList.clear();
            extractedFields.clear();

            for (int i = 0; i < items.length; ++i) {
                String item = items[i];
                if (textFieldOrdinals.contains(i)) {
                    //extract fields
                    findExtractedFields(i, item);

                    //if text field analyze
                    item = tokenize(item);
                    if (consolidateFields) {
                        consFields.append(item);
                        item = null;
                    }
                }
                if (null != item) {
                    //only if retained
                    if (retainedFieldOrdinals.contains(i)) {
                        itemList.add(item);
                    }
                }
            }

            //consolidated field at end
            if (consolidateFields) {
                itemList.add(consFields.toString());
            }

            //add extracted fields; their ordinals are assumed to be contiguous from 0
            for (int e = 0; e < extractedFields.size(); ++e) {
                String extField = extractedFields.get(e);
                itemList.add(extField);
            }

            //build value string
            boolean first = true;
            for (String item : itemList) {
                if (first) {
                    stBld.append(item);
                    first = false;
                } else {
                    stBld.append(fieldDelim).append(item);
                }
            }
            valueHolder.set(stBld.toString());
            context.write(NullWritable.get(), valueHolder);
        }

        /**
         * Runs all field extractors configured for a field and caches the first
         * non empty match per extracted field ordinal.
         * @param ordinal source field ordinal
         * @param data source field value
         */
        private void findExtractedFields(int ordinal, String data) {
            List<FieldExtractor> extractors = schema.getEntity().getExtractorsForField(ordinal);
            for (FieldExtractor extractor : extractors) {
                String extField = extractedFields.get(extractor.getOrdinal());
                if (null == extField || extField.isEmpty()) {
                    String match = extractor.findMatch(data);
                    if (null == match) {
                        match = "";
                    }
                    extractedFields.put(extractor.getOrdinal(), match);
                }
            }
        }

        /**
         * Tokenizes and stems text with the configured analyzer, returning space
         * separated tokens.
         * @param text raw text
         * @return analyzed text
         * @throws IOException
         */
        private String tokenize(String text) throws IOException {
            TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
            StringBuilder stBld = new StringBuilder();

            stream.reset();
            CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
            while (stream.incrementToken()) {
                String token = termAttribute.toString();
                stBld.append(token).append(" ");
            }
            stream.end();
            stream.close();
            return stBld.toString();
        }
    }

    /**
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new TextAnalyzer(), args);
        System.exit(exitCode);
    }
}
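
/*
 * Typical invocation (jar and property file names are deployment specific and shown
 * here only as an assumed example):
 *   hadoop jar sifarish.jar org.sifarish.common.TextAnalyzer -conf tea.properties <inputDir> <outputDir>
 * ToolRunner parses the Hadoop generic options and passes the remaining arguments to
 * run(), where args[0] is the input path and args[1] the output path.
 */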