package com.alimama.mdrill.index;

import java.io.IOException;
import java.io.StringReader;
import java.util.Iterator;

import javax.xml.parsers.ParserConfigurationException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.TermInfosWriter;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.schema.IndexSchema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.alimama.mdrill.index.utils.DocumentConverter;
import com.alimama.mdrill.index.utils.DocumentList;
import com.alimama.mdrill.index.utils.DocumentMap;
import com.alimama.mdrill.index.utils.HeartBeater;
import com.alimama.mdrill.index.utils.JobIndexPublic;
import com.alimama.mdrill.index.utils.PairWriteable;
import com.alimama.mdrill.index.utils.RamWriter;
import com.alimama.mdrill.index.utils.ShardWriter;

/**
 * Reduce side of the mdrill index build: buffers incoming {@link DocumentMap} records in memory,
 * periodically converts them into a RAM-based Lucene index and merges that into a local
 * {@link ShardWriter}, then copies the finished shard to HDFS in {@link #cleanup(Context)}.
 */
public class IndexReducer extends Reducer<PairWriteable, DocumentMap, IntWritable, Text> {
    public static Logger LOG = LoggerFactory.getLogger(IndexReducer.class);

    private HeartBeater heartBeater = null;
    private ShardWriter shardWriter = null;
    private String tmpath = null;
    private String localtmpath = null;
    private String indexHdfsPath = null;
    private Analyzer analyzer;
    private DocumentConverter documentConverter = null;
    DocumentList doclistcache = new DocumentList();
    RamWriter ramMerger = null;
    boolean isNotFdtMode = true;
    private String[] fields = null;
    private int index = 0;

    @Override
    protected void setup(Context context) throws java.io.IOException, InterruptedException {
        super.setup(context);
        debugInfo = 0;
        doccount = 0;
        TaskID taskId = context.getTaskAttemptID().getTaskID();
        this.index = taskId.getId();
        context.getCounter("higo", "dumpcount").increment(0);

        Configuration conf = context.getConfiguration();
        isNotFdtMode = conf.get("mdrill.table.mode", "").indexOf("@hdfs@") < 0;

        String fieldStrs = context.getConfiguration().get("higo.index.fields");
        String custfields = context.getConfiguration().get("higo.column.custfields", "");
        if (custfields == null || custfields.isEmpty()) {
            // "higo.index.fields" is a comma-separated list of "name:type" entries; keep the names only.
            String[] fieldslist = fieldStrs.split(",");
            this.fields = new String[fieldslist.length];
            for (int i = 0; i < fieldslist.length; i++) {
                String[] fieldSchema = fieldslist[i].split(":");
                String fieldName = fieldSchema[0].trim().toLowerCase();
                this.fields[i] = fieldName;
            }
        } else {
            // An explicit custom column list overrides the schema-derived field names.
            String[] fieldslist = custfields.split(",");
            this.fields = new String[fieldslist.length];
            for (int i = 0; i < fieldslist.length; i++) {
                this.fields[i] = fieldslist[i];
            }
        }

        TermInfosWriter.setNotUseQuick(true);
        if (!isNotFdtMode) {
            TermInfosWriter.setSkipInterVal(16);
        }

        heartBeater = new HeartBeater(context);
        heartBeater.needHeartBeat();
        this.doclistcache = new DocumentList();
        this.ramMerger = new RamWriter();

        String[] fieldslist = fieldStrs.split(",");
        this.documentConverter = new DocumentConverter(fieldslist, "solrconfig.xml", "schema.xml");
        shardWriter = this.initShardWriter(context);
        LOG.info("end initShardWriter");
        try {
            this.analyzer = this.documentConverter.getAnalyzer();
            JobIndexPublic.setAnalyzer(conf);
        } catch (Exception e) {
            throw new IOException(e);
        }
        LOG.info("end set up");
    }
    public static void main(String[] args)
            throws IOException, ParserConfigurationException, SAXException, ClassNotFoundException {
        // Local debug entry point: tokenizes a sample string with the analyzer built from a local Solr schema.
        SolrConfig solrConfig = new SolrConfig("E:\\一淘svn\\higo\\trunk\\adhoc-core\\solr\\conf\\solrconfig.xml");
        InputSource is = new InputSource(solrConfig.getResourceLoader()
                .openSchema("E:\\一淘svn\\higo\\trunk\\adhoc-core\\solr\\conf\\schema.xml"));
        IndexSchema schema = new IndexSchema(solrConfig, "solrconfig", is);

        String s = "Ab94aa4CdDbd34dfde082ed1b4c4d0c505b69";
        StringReader sr = new StringReader(s);
        Analyzer analyzer = schema.getAnalyzer();
        TokenStream tk = analyzer.tokenStream("rawquery", sr);
        boolean hasnext = tk.incrementToken();
        while (hasnext) {
            TermAttribute ta = tk.getAttribute(TermAttribute.class);
            System.out.println(ta.term());
            hasnext = tk.incrementToken();
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        try {
            LOG.info("begin clean up");
            // Flush whatever is still buffered in memory into the local shard.
            RamWriter ram = doclistcache.toRamWriter(documentConverter, analyzer, context);
            ramMerger.process(ram);
            if (this.maybeFlush(ramMerger, context, true)) {
                ramMerger = null;
            }
            doclistcache = new DocumentList();

            shardWriter.optimize();
            shardWriter.close();

            // Copy the locally built index to a temporary HDFS path, then move it to its final location.
            Configuration conf = context.getConfiguration();
            FileSystem fs = FileSystem.get(conf);
            fs.copyFromLocalFile(new Path(localtmpath), new Path(tmpath));
            if (!fs.exists(new Path(indexHdfsPath))) {
                fs.rename(new Path(tmpath), new Path(indexHdfsPath));
            }

            if (shardWriter.getNumDocs() > 0 && lastkey != null) {
                TaskID taskId = context.getTaskAttemptID().getTaskID();
                int partition = taskId.getId();
                System.out.println("###########>>>>" + partition);
                context.write(new IntWritable(partition), new Text(indexHdfsPath));
            }

            FileSystem lfs = FileSystem.getLocal(conf);
            if (lfs.exists(new Path(localtmpath))) {
                lfs.delete(new Path(localtmpath), true);
            }
        } catch (Throwable e) {
            LOG.error("cleanup", e);
            throw new IOException(e);
        } finally {
            // Always stop the heartbeat thread, even if cleanup fails.
            heartBeater.cancelHeartBeat();
            heartBeater.interrupt();
        }
    }

    private ShardWriter initShardWriter(Context context) throws IOException {
        String part_xxxxx = JobIndexPublic.getOutFileName(context, "part");
        Configuration conf = context.getConfiguration();
        FileSystem lfs = FileSystem.getLocal(conf);
        String outputdir = conf.get("mapred.output.dir");
        indexHdfsPath = new Path(outputdir, part_xxxxx).toString();
        tmpath = new Path(outputdir + "/_tmpindex",
                part_xxxxx + "_" + java.util.UUID.randomUUID().toString()).toString();
        localtmpath = new Path("./_tmpindex",
                part_xxxxx + "_" + java.util.UUID.randomUUID().toString()).toString();
        // The shard is written to local disk first and copied to HDFS in cleanup().
        return new ShardWriter(lfs, localtmpath, conf);
    }

    private IntWritable lastkey = null;
    private int debuglines = 0;
    long debugInfo = 0;

    @Override
    protected void reduce(PairWriteable key, Iterable<DocumentMap> values, Context context)
            throws java.io.IOException, InterruptedException {
        if (debugInfo % 10000 == 0) {
            LOG.info("debugInfo:" + debugInfo);
            if (debugInfo > maxDocCount_l) {
                LOG.info("debugInfo>maxDocCount_l:" + debugInfo);
                return;
            }
        }
        debugInfo++;

        if (!key.isNum()) {
            // Non-numeric keys are only used to detect and count duplicate records.
            int dumps = 0;
            Iterator<DocumentMap> iterator = values.iterator();
            while (iterator.hasNext()) {
                iterator.next();
                dumps++;
            }
            if (dumps > 1) {
                context.getCounter("higo", "dumpcount").increment(1);
                if (debuglines < 100) {
                    debuglines++;
                    System.out.println("dumpcount: " + key.toString());
                }
            }
            return;
        }

        lastkey = new IntWritable(key.getIndex());
        Iterator<DocumentMap> iterator = values.iterator();
        while (iterator.hasNext()) {
            if (doccount > maxDocCount || debugInfo > maxDocCount_l) {
                LOG.info("count over:" + debugInfo);
                break;
            }
            DocumentMap map = iterator.next();
            int addcnt = doclistcache.add(map, this.fields);
            if (addcnt <= 0) {
                context.getCounter("higo", "addempty").increment(1);
                continue;
            }
            if (!doclistcache.isoversize()) {
                continue;
            }
            // The cache is full: convert it into a RAM index and merge it into the pending writer.
            RamWriter ram = doclistcache.toRamWriter(documentConverter, analyzer, context);
            doclistcache = new DocumentList();
            ramMerger.process(ram);
            if (this.maybeFlush(ramMerger, context, false)) {
                ramMerger = new RamWriter();
            }
        }
    }

    private long minsize = 1024L * 1024 * 32;
    private static long maxDocCount = 10000L * 1000 * 10;
    private static long maxDocCount_l = 10000L * 1000 * 100;
    long doccount = 0;

    private boolean maybeFlush(RamWriter form, Context context, boolean force) throws IOException {
        if (form == null) {
            return false;
        }
        int docs = form.getNumDocs();
        if (docs <= 0) {
            return false;
        }
        // Flush when forced, or when the RAM index is large enough by document count and byte size.
        if ((docs >= 1000 && form.totalSizeInBytes() > minsize) || force || docs >= 10000) {
            try {
                context.getCounter("higo", "docCount").increment(docs);
                doccount += docs;
                form.closeWriter();
                shardWriter.process(form);
                form.closeDir();
            } catch (Throwable e) {
                LOG.error("maybeFlush error", e);
                throw new IOException(e);
            }
            return true;
        }
        return false;
    }
}
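
/*
 * Usage sketch (not part of the original source): a minimal, hypothetical driver showing how a job
 * could be wired to this reducer. The real mdrill job builder lives elsewhere in the project; the
 * missing mapper, the example field list, and the paths below are assumptions for illustration only.
 */
class IndexReducerDriverSketch {
    static void runSketch(String inputDir, String outputDir) throws Exception {
        Configuration conf = new Configuration();
        // Field schema read by IndexReducer.setup(): one "name:type" entry per column (example values).
        conf.set("higo.index.fields", "thedate:string,pv:tlong");
        org.apache.hadoop.mapreduce.Job job = new org.apache.hadoop.mapreduce.Job(conf, "mdrill-index-build");
        job.setJarByClass(IndexReducer.class);
        // job.setMapperClass(...); // a mapper emitting <PairWriteable, DocumentMap> pairs
        job.setReducerClass(IndexReducer.class);
        job.setMapOutputKeyClass(PairWriteable.class);
        job.setMapOutputValueClass(DocumentMap.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);
        org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new Path(inputDir));
        // Also sets "mapred.output.dir", which initShardWriter() uses to place the finished shard.
        org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job, new Path(outputDir));
        job.waitForCompletion(true);
    }
}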