/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.indexer.lucene;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.nutch.analysis.AnalyzerFactory;
import org.apache.nutch.analysis.NutchAnalyzer;
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.indexer.Indexer;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.NutchIndexWriter;
import org.apache.nutch.indexer.NutchSimilarity;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.util.LogUtil;
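/**
 * A {@link NutchIndexWriter} that builds a Lucene index in a local temporary
 * directory and copies it to the job's output file system on close. Per-field
 * store, index, and term-vector options are read from the job configuration
 * and can be overridden per document through document metadata.
 * <p>
 * A minimal configuration sketch (the field name "title" and the
 * {@code conf} instance are illustrative, not part of this class):
 * <pre>
 * Configuration conf = new Configuration();
 * LuceneWriter.addFieldOptions("title", LuceneWriter.STORE.YES,
 *     LuceneWriter.INDEX.TOKENIZED, LuceneWriter.VECTOR.NO, conf);
 * </pre>
 * </p>
 */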
public class LuceneWriter implements NutchIndexWriter {

  public static enum STORE { YES, NO, COMPRESS }

  public static enum INDEX { NO, NO_NORMS, TOKENIZED, UNTOKENIZED }

  public static enum VECTOR { NO, OFFSET, POS, POS_OFFSET, YES }

  private IndexWriter writer;

  private AnalyzerFactory analyzerFactory;

  private Path perm;

  private Path temp;

  private FileSystem fs;

  private final Map<String, Field.Store> fieldStore;

  private final Map<String, Field.Index> fieldIndex;

  private final Map<String, Field.TermVector> fieldVector;

  public LuceneWriter() {
    fieldStore = new HashMap<String, Field.Store>();
    fieldIndex = new HashMap<String, Field.Index>();
    fieldVector = new HashMap<String, Field.TermVector>();
  }

  /**
   * Converts a {@link NutchDocument} into a Lucene {@link Document}. Field
   * options default to the per-field settings collected by
   * {@link #processOptions(Configuration)} and may be overridden by the
   * document-level metadata written by {@link #add(NutchDocument, Field)}.
   */
  private Document createLuceneDoc(NutchDocument doc) {
    final Document out = new Document();
    out.setBoost(doc.getScore());

    final Metadata documentMeta = doc.getDocumentMeta();
    for (final Entry<String, List<String>> entry : doc) {
      final String fieldName = entry.getKey();

      Field.Store store = fieldStore.get(fieldName);
      Field.Index index = fieldIndex.get(fieldName);
      Field.TermVector vector = fieldVector.get(fieldName);

      // default values
      if (store == null) {
        store = Field.Store.NO;
      }

      if (index == null) {
        index = Field.Index.NO;
      }

      if (vector == null) {
        vector = Field.TermVector.NO;
      }

      // read document-level field information
      final String[] fieldMetas =
        documentMeta.getValues(LuceneConstants.FIELD_PREFIX + fieldName);
      if (fieldMetas.length != 0) {
        for (final String val : fieldMetas) {
          if (LuceneConstants.STORE_YES.equals(val)) {
            store = Field.Store.YES;
          } else if (LuceneConstants.STORE_NO.equals(val)) {
            store = Field.Store.NO;
          } else if (LuceneConstants.STORE_COMPRESS.equals(val)) {
            // also honor the compress hint that add(NutchDocument, Field)
            // can emit; previously this value was silently ignored
            store = Field.Store.COMPRESS;
          } else if (LuceneConstants.INDEX_TOKENIZED.equals(val)) {
            index = Field.Index.TOKENIZED;
          } else if (LuceneConstants.INDEX_NO.equals(val)) {
            index = Field.Index.NO;
          } else if (LuceneConstants.INDEX_UNTOKENIZED.equals(val)) {
            index = Field.Index.UN_TOKENIZED;
          } else if (LuceneConstants.INDEX_NO_NORMS.equals(val)) {
            index = Field.Index.NO_NORMS;
          } else if (LuceneConstants.VECTOR_NO.equals(val)) {
            vector = Field.TermVector.NO;
          } else if (LuceneConstants.VECTOR_YES.equals(val)) {
            vector = Field.TermVector.YES;
          } else if (LuceneConstants.VECTOR_POS.equals(val)) {
            vector = Field.TermVector.WITH_POSITIONS;
          } else if (LuceneConstants.VECTOR_POS_OFFSET.equals(val)) {
            vector = Field.TermVector.WITH_POSITIONS_OFFSETS;
          } else if (LuceneConstants.VECTOR_OFFSET.equals(val)) {
            vector = Field.TermVector.WITH_OFFSETS;
          }
        }
      }

      for (final String fieldValue : entry.getValue()) {
        out.add(new Field(fieldName, fieldValue, store, index, vector));
      }
    }

    return out;
  }

  /**
   * Reads per-field store/index/term-vector options from the configuration
   * (keys prefixed with {@link LuceneConstants#LUCENE_PREFIX}) into the
   * corresponding lookup maps.
   */
  @SuppressWarnings("unchecked")
  private void processOptions(Configuration conf) {
    final Iterator iterator = conf.iterator();
    while (iterator.hasNext()) {
      final String key = (String) ((Map.Entry) iterator.next()).getKey();
      if (!key.startsWith(LuceneConstants.LUCENE_PREFIX)) {
        continue;
      }
      if (key.startsWith(LuceneConstants.FIELD_STORE_PREFIX)) {
        final String field =
          key.substring(LuceneConstants.FIELD_STORE_PREFIX.length());
        final LuceneWriter.STORE store = LuceneWriter.STORE.valueOf(conf.get(key));
        switch (store) {
        case YES:
          fieldStore.put(field, Field.Store.YES);
          break;
        case NO:
          fieldStore.put(field, Field.Store.NO);
          break;
        case COMPRESS:
          fieldStore.put(field, Field.Store.COMPRESS);
          break;
        }
      } else if (key.startsWith(LuceneConstants.FIELD_INDEX_PREFIX)) {
        final String field =
          key.substring(LuceneConstants.FIELD_INDEX_PREFIX.length());
        final LuceneWriter.INDEX index = LuceneWriter.INDEX.valueOf(conf.get(key));
        switch (index) {
        case NO:
          fieldIndex.put(field, Field.Index.NO);
          break;
        case NO_NORMS:
          fieldIndex.put(field, Field.Index.NO_NORMS);
          break;
        case TOKENIZED:
          fieldIndex.put(field, Field.Index.TOKENIZED);
          break;
        case UNTOKENIZED:
          fieldIndex.put(field, Field.Index.UN_TOKENIZED);
          break;
        }
      } else if (key.startsWith(LuceneConstants.FIELD_VECTOR_PREFIX)) {
        final String field =
          key.substring(LuceneConstants.FIELD_VECTOR_PREFIX.length());
        final LuceneWriter.VECTOR vector = LuceneWriter.VECTOR.valueOf(conf.get(key));
        switch (vector) {
        case NO:
          fieldVector.put(field, Field.TermVector.NO);
          break;
        case OFFSET:
          fieldVector.put(field, Field.TermVector.WITH_OFFSETS);
          break;
        case POS:
          fieldVector.put(field, Field.TermVector.WITH_POSITIONS);
          break;
        case POS_OFFSET:
          fieldVector.put(field, Field.TermVector.WITH_POSITIONS_OFFSETS);
          break;
        case YES:
          fieldVector.put(field, Field.TermVector.YES);
          break;
        }
      }
    }
  }

  /**
   * Opens a new index in a local temporary directory and configures the
   * {@link IndexWriter} from the job settings. The index is copied to its
   * permanent output location when {@link #close()} is called.
   */
  public void open(JobConf job, String name) throws IOException {
    this.fs = FileSystem.get(job);
    perm = new Path(FileOutputFormat.getOutputPath(job), name);
    temp = job.getLocalPath("index/_" + Integer.toString(new Random().nextInt()));

    fs.delete(perm, true); // delete old, if any
    analyzerFactory = new AnalyzerFactory(job);
    writer = new IndexWriter(fs.startLocalOutput(perm, temp).toString(),
        new NutchDocumentAnalyzer(job), true);

    writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
    writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
    writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE));
    writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128));
    writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000));
    writer.setInfoStream(LogUtil.getDebugStream(Indexer.LOG));
    writer.setUseCompoundFile(false);
    writer.setSimilarity(new NutchSimilarity());

    processOptions(job);
  }
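  /**
   * Optimizes and closes the Lucene index, copies it from the local file
   * system to the job's output file system, and creates a
   * {@link Indexer#DONE_NAME} marker file in the index directory.
   */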
  public void close() throws IOException {
    writer.optimize();
    writer.close();
    fs.completeLocalOutput(perm, temp); // copy to dfs
    fs.createNewFile(new Path(perm, Indexer.DONE_NAME));
  }

  /**
   * Converts the given document to a Lucene document and adds it to the
   * index, using a language-specific analyzer when the document carries a
   * "lang" field.
   */
  public void write(NutchDocument doc) throws IOException {
    final Document luceneDoc = createLuceneDoc(doc);
    final NutchAnalyzer analyzer = analyzerFactory.get(luceneDoc.get("lang"));
    if (Indexer.LOG.isDebugEnabled()) {
      Indexer.LOG.debug("Indexing [" + luceneDoc.get("url")
          + "] with analyzer " + analyzer + " (" + luceneDoc.get("lang") + ")");
    }
    writer.addDocument(luceneDoc, analyzer);
  }

  /**
   * Adds a Lucene field.
   * <p>
   * This method is provided for backward compatibility with older indexing
   * filters. It should not be used by newer implementations, since it is
   * slower than {@link NutchDocument#add(String, String)} and will be removed
   * in a future release.
   * </p>
   * @param f Lucene field to be added.
   * @deprecated Use {@link NutchDocument#add(String, String)} instead and
   * set index-level metadata for field information.
   */
  @Deprecated
  public static void add(NutchDocument doc, Field f) {
    final String fieldName = f.name();
    final String key = LuceneConstants.FIELD_PREFIX + fieldName;
    final Metadata documentMeta = doc.getDocumentMeta();

    // record the field's store option as document metadata
    if (f.isStored()) {
      documentMeta.add(key, LuceneConstants.STORE_YES);
    } else if (f.isCompressed()) {
      documentMeta.add(key, LuceneConstants.STORE_COMPRESS);
    } else {
      documentMeta.add(key, LuceneConstants.STORE_NO);
    }

    // record the field's index option
    if (f.isIndexed()) {
      if (f.isTokenized()) {
        documentMeta.add(key, LuceneConstants.INDEX_TOKENIZED);
      } else if (f.getOmitNorms()) {
        documentMeta.add(key, LuceneConstants.INDEX_NO_NORMS);
      } else {
        documentMeta.add(key, LuceneConstants.INDEX_UNTOKENIZED);
      }
    } else {
      documentMeta.add(key, LuceneConstants.INDEX_NO);
    }

    // record the field's term-vector option
    if (f.isStoreOffsetWithTermVector() && f.isStorePositionWithTermVector()) {
      documentMeta.add(key, LuceneConstants.VECTOR_POS_OFFSET);
    } else if (f.isStoreOffsetWithTermVector()) {
      documentMeta.add(key, LuceneConstants.VECTOR_OFFSET);
    } else if (f.isStorePositionWithTermVector()) {
      documentMeta.add(key, LuceneConstants.VECTOR_POS);
    } else if (f.isTermVectorStored()) {
      documentMeta.add(key, LuceneConstants.VECTOR_YES);
    } else {
      documentMeta.add(key, LuceneConstants.VECTOR_NO);
    }
  }

  /** Sets the store, index, and term-vector options for the given field. */
  public static void addFieldOptions(String field, LuceneWriter.STORE store,
      LuceneWriter.INDEX index, LuceneWriter.VECTOR vector, Configuration conf) {
    conf.set(LuceneConstants.FIELD_STORE_PREFIX + field, store.toString());
    conf.set(LuceneConstants.FIELD_INDEX_PREFIX + field, index.toString());
    conf.set(LuceneConstants.FIELD_VECTOR_PREFIX + field, vector.toString());
  }

  /** Sets the store and index options for the given field, with no term vectors. */
  public static void addFieldOptions(String field, LuceneWriter.STORE store,
      LuceneWriter.INDEX index, Configuration conf) {
    LuceneWriter.addFieldOptions(field, store, index, LuceneWriter.VECTOR.NO, conf);
  }
}