/**
 * This software is licensed to you under the Apache License, Version 2.0 (the
 * "Apache License").
 *
 * LinkedIn's contributions are made under the Apache License. If you contribute
 * to the Software, the contributions will be deemed to have been made under the
 * Apache License, unless you expressly indicate otherwise. Please do not make any
 * contributions that would be inconsistent with the Apache License.
 *
 * You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, this software
 * distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
 * License for the specific language governing permissions and limitations for the
 * software governed under the Apache License.
 *
 * © 2012 LinkedIn Corp. All Rights Reserved.
 */
package com.senseidb.indexing.hadoop.map;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.net.URL;
import java.net.URLConnection;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.util.Version;
import org.json.JSONException;
import org.json.JSONObject;

import proj.zoie.api.ZoieSegmentReader;
import proj.zoie.api.indexing.AbstractZoieIndexable;
import proj.zoie.api.indexing.ZoieIndexable;
import proj.zoie.api.indexing.ZoieIndexable.IndexingReq;

import com.senseidb.conf.SchemaConverter;
import com.senseidb.conf.SenseiSchema;
import com.senseidb.indexing.DefaultJsonSchemaInterpreter;
import com.senseidb.indexing.JsonFilter;
import com.senseidb.indexing.ShardingStrategy;
import com.senseidb.indexing.hadoop.keyvalueformat.IntermediateForm;
import com.senseidb.indexing.hadoop.keyvalueformat.Shard;
import com.senseidb.indexing.hadoop.util.SenseiJobConfig;

/**
 * Hadoop mapper that converts each input record into a JSON document, interprets it
 * against the Sensei schema, and emits a single-document Lucene {@link IntermediateForm}
 * keyed by the {@link Shard} chosen by the configured {@link ShardingStrategy}.
 */
public class SenseiMapper extends MapReduceBase implements Mapper<Object, Object, Shard, IntermediateForm> {

  private final static Logger logger = Logger.getLogger(SenseiMapper.class);

  private static DefaultJsonSchemaInterpreter _defaultInterpreter = null;

  private boolean _use_remote_schema = false;
  private volatile boolean _isConfigured = false;
  private Configuration _conf;

  private Shard[] _shards;
  private ShardingStrategy _shardingStrategy;
  private MapInputConverter _converter;

  private static Analyzer analyzer;

  public void map(Object key, Object value, OutputCollector<Shard, IntermediateForm> output,
      Reporter reporter) throws IOException {

    if (!_isConfigured) {
      throw new IllegalStateException("Mapper's configure method wasn't successful. "
          + "May not get the correct schema or Lucene Analyzer.");
    }
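    // Convert the raw (key, value) record into a Sensei JSON document and apply
    // the configured filter before the schema interpreter sees it.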
    JSONObject json = null;
    try {
      json = _converter.getJsonInput(key, value, _conf);
      json = _converter.doFilter(json);
    } catch (Exception e) {
      ExceptionUtils.printRootCauseStackTrace(e);
      throw new IllegalStateException("Data conversion or filtering failed inside mapper.", e);
    }

    if (_defaultInterpreter == null) {
      reporter.incrCounter("Map", "Interpreter_null", 1);
    }

    if (_defaultInterpreter != null && json != null && analyzer != null) {
      ZoieIndexable indexable = _defaultInterpreter.convertAndInterpret(json);
      IndexingReq[] idxReqs = indexable.buildIndexingReqs();
      if (idxReqs.length > 0) {
        Document doc = idxReqs[0].getDocument();
        ZoieSegmentReader.fillDocumentID(doc, indexable.getUID());
        if (indexable.isStorable()) {
          byte[] bytes = indexable.getStoreValue();
          if (bytes != null) {
            doc.add(new Field(AbstractZoieIndexable.DOCUMENT_STORE_FIELD, bytes));
          }
        }

        // Now we have the UID and the Lucene Document; build the intermediate form.
        IntermediateForm form = new IntermediateForm();
        form.configure(_conf);
        form.process(doc, analyzer);
        form.closeWriter();

        int chosenShard = -1;
        try {
          chosenShard = _shardingStrategy.caculateShard(_shards.length, json);
        } catch (JSONException e) {
          throw new IOException("Sharding does not work for mapper.", e);
        }

        if (chosenShard >= 0) {
          // insert into one shard
          output.collect(_shards[chosenShard], form);
        } else {
          throw new IOException("Chosen shard for insert must be >= 0. Current shard is: " + chosenShard);
        }
      }
    }
  }

  @Override
  public void configure(JobConf job) {
    super.configure(job);
    _conf = job;
    _shards = Shard.getIndexShards(_conf);

    _shardingStrategy = (ShardingStrategy) ReflectionUtils.newInstance(
        job.getClass(SenseiJobConfig.DISTRIBUTION_POLICY, DummyShardingStrategy.class, ShardingStrategy.class), job);

    _converter = (MapInputConverter) ReflectionUtils.newInstance(
        job.getClass(SenseiJobConfig.MAPINPUT_CONVERTER, DummyMapInputConverter.class, MapInputConverter.class), job);

    try {
      setSchema(job);
      setAnalyzer(job);
      _isConfigured = true;
    } catch (Exception e) {
      logger.error("Failed to configure mapper", e);
      _isConfigured = false;
    }
  }

  private void setAnalyzer(JobConf conf) throws Exception {
    if (analyzer != null) {
      return;
    }

    String version = _conf.get(SenseiJobConfig.DOCUMENT_ANALYZER_VERSION);
    if (version == null) {
      throw new IllegalStateException("Analyzer version has not been specified.");
    }

    String analyzerName = _conf.get(SenseiJobConfig.DOCUMENT_ANALYZER);
    if (analyzerName == null) {
      throw new IllegalStateException("Analyzer name has not been specified.");
    }

    // Instantiate the configured Analyzer reflectively; the analyzer class is expected
    // to provide a single-argument constructor taking the Lucene Version enum.
    Class<?> analyzerClass = Class.forName(analyzerName);
    Constructor<?> constructor = analyzerClass.getConstructor(Version.class);
    analyzer = (Analyzer) constructor.newInstance(Version.valueOf(version));
  }

  private void setSchema(JobConf conf) throws Exception {
    String schemaURI = null;
    String metadataFileName = conf.get(SenseiJobConfig.SCHEMA_FILE_URL);

    // If the schema file was shipped through the DistributedCache, prefer the local copy.
    Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
    if (localFiles != null) {
      for (int i = 0; i < localFiles.length; i++) {
        String strFileName = localFiles[i].toString();
        if (strFileName.contains(conf.get(SenseiJobConfig.SCHEMA_FILE_URL))) {
          metadataFileName = strFileName;
          break;
        }
      }
    }

    if (metadataFileName != null && metadataFileName.length() > 0) {
      schemaURI = "file:///" + metadataFileName;
      if (_defaultInterpreter == null) {
        logger.info("schema file is: " + schemaURI);

        URL url = new URL(schemaURI);
        URLConnection conn = url.openConnection();
        conn.connect();

        File xmlSchema = new File(url.toURI());
        if (!xmlSchema.exists()) {
          throw new ConfigurationException("Schema file does not exist: " + xmlSchema.getAbsolutePath());
        }
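        // Parse the XML schema, convert it to Sensei's JSON schema representation, and
        // build the interpreter; the field is static, so it is shared by all map() calls
        // running in this JVM.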
"schema not file"); } DocumentBuilderFactory dbf = DocumentBuilderFactory .newInstance(); dbf.setIgnoringComments(true); DocumentBuilder db = dbf.newDocumentBuilder(); org.w3c.dom.Document schemaXml = db .parse(xmlSchema); schemaXml.getDocumentElement().normalize(); JSONObject schemaData = SchemaConverter .convert(schemaXml); SenseiSchema schema = SenseiSchema.build(schemaData); _defaultInterpreter = new DefaultJsonSchemaInterpreter(schema); } } } }