/*
* chombo: Hadoop Map Reduce utility
* Author: Pranab Ghosh
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.chombo.util;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.util.Version;
/**
 * Configurable text record transformer. A record is first filtered either by
 * removing everything matching a set of exclude regexes, or by keeping only the
 * whitespace separated tokens that fully match one of a set of include regexes.
 * The filtered text is then optionally run through a Lucene analyzer.
 *
 * Exclude and include filters are mutually exclusive: when exclude regexes have
 * been configured, include regexes are ignored.
 *
 * @author pranab
 *
 */
public class TextTransformer {
    /** Compiled exclude regexes; null when no exclude filter is configured. */
    private Pattern[] excludePatterns;
    /** Compiled include regexes; null when no include filter is configured. */
    private Pattern[] includePatterns;
    private boolean doAnalyze;
    private Analyzer analyzer;

    public TextTransformer() {
    }

    /**
     * Configures the regexes whose matches are removed from each record.
     * The regexes are compiled once here instead of once per processed record.
     *
     * @param excludeRegexes regexes to strip out; null disables the exclude filter
     * @return this transformer, for call chaining
     * @throws java.util.regex.PatternSyntaxException if any regex is malformed
     */
    public TextTransformer withExcludeRegexes(String[] excludeRegexes) {
        this.excludePatterns = compileAll(excludeRegexes);
        return this;
    }

    /**
     * Configures the regexes used to select tokens. Only tokens that match one
     * of these regexes in full are kept.
     *
     * @param includeRegexes regexes a token must fully match; null disables the include filter
     * @return this transformer, for call chaining
     * @throws java.util.regex.PatternSyntaxException if any regex is malformed
     */
    public TextTransformer withIncludeRegexes(String[] includeRegexes) {
        this.includePatterns = compileAll(includeRegexes);
        return this;
    }

    /**
     * Enables or disables the analysis step and selects the analyzer by language.
     *
     * @param doAnalyze whether processed text should be passed through the analyzer
     * @param lang language code; only "en" is supported
     * @return this transformer, for call chaining
     * @throws IllegalArgumentException if the language is null or not supported
     */
    public TextTransformer withDoAnalyze(boolean doAnalyze, String lang) {
        // constant first comparison, so a null language is reported as unsupported
        // instead of failing with a NullPointerException
        if (!"en".equals(lang)) {
            throw new IllegalArgumentException("unsupported language:" + lang);
        }
        // state is changed only after validation, so a rejected call leaves
        // the transformer as it was
        this.analyzer = new EnglishAnalyzer(Version.LUCENE_44);
        this.doAnalyze = doAnalyze;
        return this;
    }

    /**
     * Applies the configured filter and the optional analysis step to a record.
     *
     * @param record text to transform
     * @return transformed text
     * @throws IOException declared for the analysis step
     */
    public String process(String record) throws IOException {
        String processed = record;
        if (null != excludePatterns) {
            // exclude filter wins over include filter when both are configured
            for (Pattern pattern : excludePatterns) {
                processed = pattern.matcher(processed).replaceAll("");
            }
        } else if (null != includePatterns) {
            List<String> matchedTokens = new ArrayList<String>();
            String[] items = record.split("\\s+");
            for (String item : items) {
                if (matchesAny(item, includePatterns)) {
                    matchedTokens.add(item);
                }
            }
            processed = Utility.join(matchedTokens, " ");
        }

        if (doAnalyze) {
            processed = Utility.analyze(processed, analyzer);
        }
        return processed;
    }

    /**
     * Compiles every regex in the array, preserving order.
     *
     * @param regexes regexes to compile, may be null
     * @return compiled patterns, or null when the input is null
     */
    private static Pattern[] compileAll(String[] regexes) {
        if (null == regexes) {
            return null;
        }
        Pattern[] patterns = new Pattern[regexes.length];
        for (int i = 0; i < regexes.length; ++i) {
            patterns[i] = Pattern.compile(regexes[i]);
        }
        return patterns;
    }

    /**
     * Checks whether the token fully matches at least one of the patterns.
     *
     * @param token token to test
     * @param patterns candidate patterns
     * @return true if some pattern matches the whole token
     */
    private static boolean matchesAny(String token, Pattern[] patterns) {
        for (Pattern pattern : patterns) {
            if (pattern.matcher(token).matches()) {
                return true;
            }
        }
        return false;
    }
}