/** * This software is licensed to you under the Apache License, Version 2.0 (the * "Apache License"). * * LinkedIn's contributions are made under the Apache License. If you contribute * to the Software, the contributions will be deemed to have been made under the * Apache License, unless you expressly indicate otherwise. Please do not make any * contributions that would be inconsistent with the Apache License. * * You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, this software * distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache * License for the specific language governing permissions and limitations for the * software governed under the Apache License. * * © 2012 LinkedIn Corp. All Rights Reserved. */ package com.senseidb.indexing.hadoop.map; import java.io.File; import java.io.IOException; import java.lang.reflect.Constructor; import java.net.URL; import java.net.URLConnection; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import org.apache.commons.configuration.ConfigurationException; import org.apache.commons.lang.exception.ExceptionUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.filecache.DistributedCache; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.util.ReflectionUtils; import org.apache.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.util.Version; import org.json.JSONException; import org.json.JSONObject; import proj.zoie.api.ZoieSegmentReader; import proj.zoie.api.indexing.AbstractZoieIndexable; import proj.zoie.api.indexing.ZoieIndexable; import proj.zoie.api.indexing.ZoieIndexable.IndexingReq; import com.senseidb.conf.SchemaConverter; import com.senseidb.conf.SenseiSchema; import com.senseidb.indexing.DefaultJsonSchemaInterpreter; import com.senseidb.indexing.JsonFilter; import com.senseidb.indexing.ShardingStrategy; import com.senseidb.indexing.hadoop.keyvalueformat.IntermediateForm; import com.senseidb.indexing.hadoop.keyvalueformat.Shard; import com.senseidb.indexing.hadoop.util.SenseiJobConfig; public class SenseiMapper extends MapReduceBase implements Mapper<Object, Object, Shard, IntermediateForm> { private final static Logger logger = Logger.getLogger(SenseiMapper.class); private static DefaultJsonSchemaInterpreter _defaultInterpreter = null; private boolean _use_remote_schema = false; private volatile boolean _isConfigured = false; private Configuration _conf; private Shard[] _shards; private ShardingStrategy _shardingStategy; private MapInputConverter _converter; private static Analyzer analyzer; public void map(Object key, Object value, OutputCollector<Shard, IntermediateForm> output, Reporter reporter) throws IOException { if(_isConfigured == false) throw new IllegalStateException("Mapper's configure method wasn't sucessful. May not get the correct schema or Lucene Analyzer."); JSONObject json = null; try{ json = _converter.getJsonInput(key, value, _conf); json = _converter.doFilter(json); }catch(Exception e){ ExceptionUtils.printRootCauseStackTrace(e); throw new IllegalStateException("data conversion or filtering failed inside mapper. \n"); } if( _defaultInterpreter == null) reporter.incrCounter("Map", "Interpreter_null", 1); if( _defaultInterpreter != null && json != null && analyzer != null){ ZoieIndexable indexable = _defaultInterpreter.convertAndInterpret(json); IndexingReq[] idxReqs = indexable.buildIndexingReqs(); if(idxReqs.length>0){ Document doc = idxReqs[0].getDocument(); ZoieSegmentReader.fillDocumentID(doc, indexable.getUID()); if (indexable.isStorable()){ byte[] bytes = indexable.getStoreValue(); if (bytes!=null){ doc.add(new Field(AbstractZoieIndexable.DOCUMENT_STORE_FIELD,bytes)); } } //now we have uid and lucene Doc; IntermediateForm form = new IntermediateForm(); form.configure(_conf); form.process(doc, analyzer); form.closeWriter(); int chosenShard = -1; try { chosenShard = _shardingStategy.caculateShard(_shards.length, json); } catch (JSONException e) { throw new IOException("sharding dose not work for mapper."); } if (chosenShard >= 0) { // insert into one shard output.collect(_shards[chosenShard], form); } else { throw new IOException("Chosen shard for insert must be >= 0. current shard is: " + chosenShard); } } } } @Override public void configure(JobConf job) { super.configure(job); _conf = job; _shards = Shard.getIndexShards(_conf); _shardingStategy = (ShardingStrategy) ReflectionUtils.newInstance( job.getClass(SenseiJobConfig.DISTRIBUTION_POLICY, DummyShardingStrategy.class, ShardingStrategy.class), job); _converter = (MapInputConverter) ReflectionUtils.newInstance( job.getClass(SenseiJobConfig.MAPINPUT_CONVERTER, DummyMapInputConverter.class, MapInputConverter.class), job); try { setSchema(job); setAnalyzer(job); _isConfigured = true; } catch (Exception e) { e.printStackTrace(); _isConfigured = false; } } private void setAnalyzer(JobConf conf) throws Exception{ if(analyzer != null) return; String version = _conf.get(SenseiJobConfig.DOCUMENT_ANALYZER_VERSION); if(version == null) throw new IllegalStateException("version has not been specified"); String analyzerName = _conf.get(SenseiJobConfig.DOCUMENT_ANALYZER); if(analyzerName == null) throw new IllegalStateException("analyzer name has not been specified"); Class analyzerClass = Class.forName(analyzerName); Constructor constructor = analyzerClass.getConstructor(Version.class); analyzer = (Analyzer) constructor.newInstance((Version) Enum.valueOf((Class)Class.forName("org.apache.lucene.util.Version"), version)); } private void setSchema(JobConf conf) throws Exception { String _schema_uri = null; String metadataFileName = conf.get(SenseiJobConfig.SCHEMA_FILE_URL); Path[] localFiles = DistributedCache.getLocalCacheFiles(conf); if (localFiles != null) { for (int i = 0; i < localFiles.length; i++) { String strFileName = localFiles[i].toString(); if (strFileName.contains(conf.get(SenseiJobConfig.SCHEMA_FILE_URL))) { metadataFileName = strFileName; break; } } } if (metadataFileName != null && metadataFileName.length() > 0) { _schema_uri = "file:///" + metadataFileName; if (_defaultInterpreter == null) { logger.info("schema file is:" + _schema_uri); URL url = new URL(_schema_uri); URLConnection conn = url.openConnection(); conn.connect(); File xmlSchema = new File(url.toURI()); if (!xmlSchema.exists()) { throw new ConfigurationException( "schema not file"); } DocumentBuilderFactory dbf = DocumentBuilderFactory .newInstance(); dbf.setIgnoringComments(true); DocumentBuilder db = dbf.newDocumentBuilder(); org.w3c.dom.Document schemaXml = db .parse(xmlSchema); schemaXml.getDocumentElement().normalize(); JSONObject schemaData = SchemaConverter .convert(schemaXml); SenseiSchema schema = SenseiSchema.build(schemaData); _defaultInterpreter = new DefaultJsonSchemaInterpreter(schema); } } } }