package com.alimama.mdrill.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import com.alimama.mdrill.index.utils.DocumentMap;
import com.alimama.mdrill.index.utils.JobIndexPublic;
import com.alimama.mdrill.index.utils.PairWriteable;
import com.alimama.mdrill.index.utils.TdateFormat;
/**
 * Mapper that turns delimited text records into {@link DocumentMap} values.
 *
 * <p>Each incoming {@link Text} value is split on newlines; every resulting
 * record is split on the configured column separator, normalised column by
 * column and written to the context keyed by a running integer index wrapped
 * in a {@link PairWriteable}. When a uniq-check field is configured, an extra
 * pair keyed by {@code "uniq_" + value} is written for every record that has a
 * non-placeholder value in that column.
 *
 * <p>Configuration keys read in {@link #setup(Context)}:
 * {@code mdrill.table.mode}, {@code higo.index.fields},
 * {@code uniq.check.field}, {@code higo.column.split},
 * {@code higo.column.custfields}, {@code higo.column.userthedate} and
 * {@code higo.input.base}.
 */
public class IndexMapper extends Mapper<WritableComparable, Text, PairWriteable, DocumentMap> {
    /** Matches one {@code @fieldcontains:k=v&k2=v2@} section of the table mode string. */
    private static final Pattern FIELD_CONTAINS_PATTERN = Pattern.compile("@fieldcontains:([^@]+)@");

    // Per-column metadata; all arrays share the same length once setup() has run.
    private String[] fields = null;
    private boolean[] isDate;
    private boolean[] isString;
    private boolean[] isStore;
    // contains[i] holds the substrings column i must contain, or null when unfiltered.
    private String[][] contains;
    private boolean containsfilter = false;

    private String split = "\001";
    private boolean usedthedate = true;
    private String thedate = null;
    private Integer Index = (int) (Math.random() * 10000);
    private String uniqfield = "";
    private int uniqfieldIndex = -1;
    private boolean isuniqcheck = false;
    private int thedateIndex = -1;

    // Caps on how many diagnostic lines are printed to stdout.
    private int debuglines = 0;
    private int printlines = 0;

    /**
     * Reads the job configuration and prepares the per-column metadata.
     *
     * <p>The column list comes from {@code higo.column.custfields} when that
     * value is non-empty (all columns are then treated as non-stored strings);
     * otherwise it comes from {@code higo.index.fields}, whose comma-separated
     * entries are split on {@code ':'} with part 0 used as the name, part 1 as
     * the type and part 3 as the stored flag.
     *
     * @param context the task context supplying configuration and input split
     * @throws IOException propagated from the superclass or helpers
     * @throws InterruptedException propagated from the superclass
     */
    @Override
    public void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);
        TaskID taskId = context.getTaskAttemptID().getTaskID();
        this.Index = taskId.getId();
        System.out.println("###########>>>>" + this.Index);

        Configuration conf = context.getConfiguration();
        HashMap<String, ArrayList<String>> filters = parseContainsFilters(conf.get("mdrill.table.mode", ""));
        containsfilter = filters.size() > 0;

        String fieldStrs = conf.get("higo.index.fields");
        this.uniqfield = conf.get("uniq.check.field");
        if (this.uniqfield != null && this.uniqfield.length() > 0) {
            this.isuniqcheck = true;
        }
        split = MakeIndex.parseSplit(conf.get("higo.column.split", split));
        String custfields = conf.get("higo.column.custfields", "");
        usedthedate = conf.getBoolean("higo.column.userthedate", usedthedate);
        this.thedate = null;

        InputSplit inputSplit = context.getInputSplit();
        Path filepath = ((FileSplit) inputSplit).getPath();
        if (filepath != null) {
            // Replaces the task-id based start index with one derived from the
            // input path. The remainder is within (-1000000, 1000000), so
            // Math.abs cannot overflow here.
            this.Index = Math.abs(filepath.toString().hashCode() % 1000000);
        }

        if (usedthedate) {
            String inputbase = conf.get("higo.input.base");
            this.thedate = JobIndexPublic.parseThedate(new Path(inputbase), filepath);
            System.out.println("thedatepath: " + thedate + "@" + filepath + "@" + inputbase + "");
        }

        if (custfields == null || custfields.isEmpty()) {
            String[] fieldslist = fieldStrs.split(",");
            allocateColumns(fieldslist.length);
            for (int i = 0; i < fieldslist.length; i++) {
                String[] fieldSchema = fieldslist[i].split(":");
                String fieldName = fieldSchema[0].trim().toLowerCase();
                String type = fieldSchema[1];
                registerColumn(i, fieldName, filters);
                this.isStore[i] = Boolean.parseBoolean(fieldSchema[3]);
                this.isDate[i] = type.equalsIgnoreCase("tdate");
                this.isString[i] = type.equalsIgnoreCase("string");
            }
        } else {
            String[] fieldslist = custfields.split(",");
            allocateColumns(fieldslist.length);
            for (int i = 0; i < fieldslist.length; i++) {
                // Custom column names are used exactly as configured.
                registerColumn(i, fieldslist[i], filters);
                this.isStore[i] = false;
                this.isDate[i] = false;
                this.isString[i] = true;
            }
        }
    }

    /**
     * Extracts the contains-filters from the table mode string.
     *
     * @param mode value of {@code mdrill.table.mode}; never null because the
     *             caller supplies an empty default
     * @return map from field name to the substrings required for that field;
     *         empty when the mode string has no {@code @fieldcontains:} section
     */
    private static HashMap<String, ArrayList<String>> parseContainsFilters(String mode) {
        HashMap<String, ArrayList<String>> result = new HashMap<String, ArrayList<String>>();
        Matcher matcher = FIELD_CONTAINS_PATTERN.matcher(mode);
        while (matcher.find()) {
            for (String entry : matcher.group(1).split("&")) {
                String[] kvpair = entry.split("=");
                if (kvpair.length < 2) {
                    continue;
                }
                ArrayList<String> list = result.get(kvpair[0]);
                if (list == null) {
                    list = new ArrayList<String>();
                    result.put(kvpair[0], list);
                }
                list.add(kvpair[1]);
            }
        }
        return result;
    }

    /** Allocates every per-column array with the given length. */
    private void allocateColumns(int count) {
        this.fields = new String[count];
        this.isDate = new boolean[count];
        this.isString = new boolean[count];
        this.isStore = new boolean[count];
        // Always allocated so that line() can iterate it regardless of which
        // configuration branch supplied the column list.
        this.contains = new String[count][];
    }

    /**
     * Records the name of column {@code i}, attaches its contains-filter (if
     * any) and remembers its position when it is the "thedate" or uniq column.
     */
    private void registerColumn(int i, String fieldName, HashMap<String, ArrayList<String>> filters) {
        this.fields[i] = fieldName;
        ArrayList<String> filterlist = filters.get(fieldName);
        this.contains[i] = (filterlist == null) ? null : filterlist.toArray(new String[filterlist.size()]);
        if (fieldName.equals("thedate")) {
            thedateIndex = i;
        }
        if (this.isuniqcheck && fieldName.equals(this.uniqfield)) {
            uniqfieldIndex = i;
        }
    }

    /** No per-task cleanup is performed. */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
    }

    /**
     * Normalises a raw column value.
     *
     * @param input   raw text, may be null
     * @param context used to bump the "nullcolcount" / "bigtextskip" counters
     * @return the trimmed value, or null when the input is null, empty after
     *         trimming, one of the null markers ({@code \N}, {@code \n},
     *         case-insensitive {@code null}) or at least 512000 characters long
     */
    private String parseDefault(String input, Context context) {
        if (input == null) {
            return null;
        }
        String trimmed = input.trim();
        if (trimmed.isEmpty() || trimmed.equals("\\N") || trimmed.equals("\\n")
                || trimmed.equalsIgnoreCase("null")) {
            context.getCounter("higo", "nullcolcount").increment(1);
            return null;
        }
        if (trimmed.length() >= 512000) {
            context.getCounter("higo", "bigtextskip").increment(1);
            return null;
        }
        return trimmed;
    }

    /**
     * Decides whether a record is worth processing.
     *
     * <p>With {@code usedthedate} enabled a record needs at least two columns;
     * otherwise the whole record must survive {@link #parseDefault}.
     *
     * @return true when the record should be processed further
     */
    private boolean validate(String[] values, String record, Context context) {
        if (!usedthedate) {
            return parseDefault(record, context) != null;
        }
        if (values.length >= 2) {
            return true;
        }
        if (debuglines < 100) {
            debuglines++;
            System.out.println("miss columns values2: " + record.replaceAll(split, "#") + "");
        }
        context.getCounter("higo", "skiprecords").increment(1);
        return false;
    }

    /**
     * Processes a single record: splits it, normalises the columns, applies the
     * optional contains-filter and writes the result to the context.
     *
     * @return false when the record fails {@link #validate}; true otherwise
     *         (including when the record is dropped by the contains-filter)
     */
    private boolean line(String record, Context context) throws IOException, InterruptedException {
        context.getCounter("higo", "totalrecord").increment(1);
        String[] values = record.split(split, -1);
        if (!this.validate(values, record, context)) {
            return false;
        }

        String[] res = new String[fields.length];
        for (int i = 0; i < fields.length; i++) {
            String raw = (i < values.length) ? values[i] : null;
            String val = parseDefault(raw, context);
            if (this.isDate[i]) {
                res[i] = TdateFormat.ensureTdate(val, fields[i]);
            } else if (val != null) {
                res[i] = val;
            } else if (this.isString[i]) {
                // Placeholder for a missing string column.
                res[i] = "_";
            }
        }

        if (usedthedate && thedateIndex >= 0) {
            if (thedate != null) {
                res[thedateIndex] = thedate;
            }
            // String.valueOf turns a null column into the text "null", so the
            // value is never null from here on.
            String day = String.valueOf(res[thedateIndex]).replace("-", "").replace("_", "");
            res[thedateIndex] = day;
            if (day.length() != 8) {
                if (debuglines < 100) {
                    debuglines++;
                    System.out.println("miss thedate values: " + record.replaceAll(split, "#") + "");
                }
                // NOTE(review): the record is counted as skipped but is still
                // emitted below - confirm whether it should return here.
                context.getCounter("higo", "skiprecords").increment(1);
            }
            context.getCounter("higo", "dayrecord_" + day).increment(1);
        }

        if (printlines < 10) {
            printlines++;
            System.out.println("res:" + Arrays.toString(values));
        }

        if (this.containsfilter && !passesContainsFilter(res)) {
            context.getCounter("higo", "skiprecords_filter").increment(1);
            return true;
        }

        context.write(new PairWriteable(this.Index++), new DocumentMap(res));

        // -1 means "no uniq column"; index 0 is a valid column position.
        if (this.isuniqcheck && uniqfieldIndex >= 0 && res[uniqfieldIndex] != null) {
            String notempty = res[uniqfieldIndex];
            if (notempty.length() > 0 && !notempty.equals("_")) {
                context.write(new PairWriteable(new Text("uniq_" + notempty)), new DocumentMap());
            }
        }
        return true;
    }

    /**
     * Checks every filtered column of {@code res} against its required
     * substrings.
     *
     * @return false as soon as a filtered column is missing, null, or lacks one
     *         of its required substrings; true otherwise
     */
    private boolean passesContainsFilter(String[] res) {
        for (int i = 0; i < this.contains.length; i++) {
            String[] required = this.contains[i];
            if (required == null) {
                continue;
            }
            if (i >= res.length) {
                return false;
            }
            String val = res[i];
            if (val == null) {
                // A missing value cannot contain any required substring.
                return false;
            }
            for (String s : required) {
                if (val.indexOf(s) < 0) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Splits the incoming value on runs of newlines and processes each piece as
     * one record.
     *
     * @param key     input key, not used
     * @param value   text holding one or more newline-separated records
     * @param context task context receiving the output pairs and counters
     */
    @Override
    public void map(WritableComparable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] records = value.toString().split("[\n]+");
        for (String record : records) {
            this.line(record, context);
        }
    }
}