/**
* Copyright (c) 2008--2015 Red Hat, Inc.
*
* This software is licensed to you under the GNU General Public License,
* version 2 (GPLv2). There is NO WARRANTY for this software, express or
* implied, including the implied warranties of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. You should have received a copy of GPLv2
* along with this software; if not, see
* http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
*
* Red Hat trademarks are not licensed under GPLv2. No permission is
* granted to use or replicate Red Hat trademarks that are incorporated
* in this software or its documentation.
*/
package com.redhat.satellite.search.index;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.nutch.analysis.AnalyzerFactory;
import org.apache.nutch.searcher.FetchedSegments;
import org.apache.nutch.searcher.HitDetails;
import org.apache.nutch.searcher.Summary;
import org.apache.nutch.util.NutchConfiguration;
import com.redhat.satellite.search.config.Configuration;
import com.redhat.satellite.search.index.builder.BuilderFactory;
import com.redhat.satellite.search.index.ngram.NGramAnalyzer;
import com.redhat.satellite.search.index.ngram.NGramQueryParser;
import com.redhat.satellite.search.rpc.handlers.IndexHandler;
/**
* Indexing workhorse class
*
* @version $Rev$
*/
public class IndexManager {
private static Logger log = Logger.getLogger(IndexManager.class);
// Base directory for all indexes; constructor guarantees a trailing "/".
private String indexWorkDir;
// Upper bound on results returned per search; 0 means no limit.
private int maxHits;
// Score cut-offs used by isScoreAcceptable() to drop poor matches.
private double score_threshold;
private double system_score_threshold;
private double errata_score_threshold;
private double errata_advisory_score_threshold;
// ngram size bounds handed to NGramAnalyzer / the per-field analyzers.
private int min_ngram;
private int max_ngram;
// When true, documentation search results are also score-filtered.
private boolean filterDocResults = false;
// When true, each search logs Lucene Explanation data for top hits.
private boolean explainResults = false;
private AnalyzerFactory nutchAnalyzerFactory;
// Name conflict with our Configuration class and Hadoop's
private org.apache.hadoop.conf.Configuration nutchConf;
// Maps language codes (e.g. "en", "pt_br") to doc index dir names ("en-US");
// case-insensitive so callers may pass any casing.
private Map<String, String> docLocaleLookUp = new TreeMap<String, String>
(String.CASE_INSENSITIVE_ORDER);
// Per-language nutch segments used for doc summaries; a null value marks a
// language whose segments were missing when initDocSummary() ran.
private Map<String, FetchedSegments> docSegments;
/**
 * Constructor
 *
 * @param config application config; must contain "search.index_work_dir"
 * @throws IllegalArgumentException if search.index_work_dir is missing
 */
public IndexManager(Configuration config) {
    maxHits = config.getInt("search.max_hits_returned", 0);
    indexWorkDir = config.getString("search.index_work_dir", null);
    if (indexWorkDir == null) {
        throw new IllegalArgumentException(
                "search.index_work_dir config entry " + "is missing");
    }
    // Normalize so path concatenation elsewhere can assume a trailing slash.
    if (!indexWorkDir.endsWith("/")) {
        indexWorkDir += "/";
    }
    score_threshold = config.getDouble("search.score_threshold", .30);
    system_score_threshold = config.getDouble("search.system_score_threshold", .30);
    errata_score_threshold = config.getDouble("search.errata_score_threshold", .30);
    errata_advisory_score_threshold =
            config.getDouble("search.errata.advisory_score_threshold", .30);
    min_ngram = config.getInt("search.min_ngram", 1);
    max_ngram = config.getInt("search.max_ngram", 5);
    // Locale table must be populated before initDocSummary() iterates it.
    initDocLocaleLookup();
    filterDocResults = config.getBoolean("search.doc.limit_results");
    explainResults = config.getBoolean("search.log.explain.results");
    initDocSummary();
}
/**
 * Query an index using the default matching mode (not fine grained), which
 * allows flexible matches such as similarly spelled words.
 *
 * @param indexName name of the index
 * @param query search query
 * @param lang language
 * @return list of hits
 * @throws IndexingException if there is a problem indexing the content.
 * @throws QueryParseException if the query cannot be parsed
 */
public List<Result> search(String indexName, String query, String lang)
    throws IndexingException, QueryParseException {
    final boolean fineGrained = false;
    return search(indexName, query, lang, fineGrained);
}
/**
 * Query a index
 *
 * @param indexName name of the index
 * @param query search query
 * @param lang language
 * @param isFineGrained
 * true: will limit results, less are returned but they are closer
 * to the search query, useful for advanced/free form queries
 *
 * false: will allow queries to be more flexible returning words
 * which are spelled similarly
 *
 * @return list of hits
 * @throws IndexingException if there is a problem indexing the content.
 * @throws QueryParseException if the query cannot be parsed
 */
public List<Result> search(String indexName, String query, String lang,
        boolean isFineGrained)
    throws IndexingException, QueryParseException {
    IndexSearcher searcher = null;
    IndexReader reader = null;
    List<Result> retval = null;
    try {
        reader = getIndexReader(indexName, lang);
        searcher = getIndexSearcher(indexName, lang);
        QueryParser qp = getQueryParser(indexName, lang, isFineGrained);
        Query q = qp.parse(query);
        if (log.isDebugEnabled()) {
            log.debug("Original query was: " + query);
            log.debug("Parsed Query is: " + q.toString());
        }
        Hits hits = searcher.search(q);
        if (log.isDebugEnabled()) {
            log.debug(hits.length() + " results were found.");
        }
        Set<Term> queryTerms = new HashSet<Term>();
        try {
            // rewrite() expands wildcard/fuzzy clauses so terms can be
            // extracted for matching-field guessing in processHits().
            Query newQ = q.rewrite(reader);
            newQ.extractTerms(queryTerms);
        }
        catch (Exception e) {
            // Was e.printStackTrace(); route through the logger instead.
            log.error("Unable to extract terms from query: " + query, e);
            throw new QueryParseException(e);
        }
        retval = processHits(indexName, hits, queryTerms, query, lang);
        if (explainResults) {
            debugExplainResults(indexName, hits, searcher, q, queryTerms);
        }
    }
    catch (IOException e) {
        // This exception is thrown when there're no packages or errata on the
        // system and the user performs a search. In that case just return 0
        // results, otherwise rethrow the exception.
        // BUG FIX: guard against a null message (previously NPE'd here) and
        // drop the hard-coded /var/lib/rhn/... path so configurations with a
        // non-default search.index_work_dir are handled too.
        String msg = e.getMessage();
        if (msg == null || !msg.contains("no segments* file found in")) {
            throw new IndexingException(e);
        }
        log.error(msg);
        retval = new ArrayList<Result>();
    }
    catch (ParseException e) {
        throw new QueryParseException("Could not parse query: '" + query + "'");
    }
    finally {
        // Close both resources even if one close fails; previously a failing
        // searcher.close() leaked the reader. Any close failure is surfaced
        // as IndexingException, matching the original contract.
        IOException closeFailure = null;
        if (searcher != null) {
            try {
                searcher.close();
            }
            catch (IOException ex) {
                closeFailure = ex;
            }
        }
        if (reader != null) {
            try {
                reader.close();
            }
            catch (IOException ex) {
                closeFailure = ex;
            }
        }
        if (closeFailure != null) {
            throw new IndexingException(closeFailure);
        }
    }
    return retval;
}
/**
 * Create an empty index if one does not already exist; opening and closing
 * an IndexWriter materializes the index files on disk.
 *
 * @param indexName index to use
 * @param lang language.
 * @throws IndexingException something went wrong creating the index
 */
public void createIndex(String indexName, String lang)
    throws IndexingException {
    try {
        IndexWriter writer = getIndexWriter(indexName, lang);
        try {
            // Force the (empty) index state to disk.
            writer.flush();
        }
        finally {
            try {
                writer.close();
            }
            finally {
                // unlock it if it is locked.
                unlockIndex(indexName);
            }
        }
    }
    catch (CorruptIndexException e) {
        throw new IndexingException(e);
    }
    catch (LockObtainFailedException e) {
        throw new IndexingException(e);
    }
    catch (IOException e) {
        throw new IndexingException(e);
    }
}
/**
 * Adds a document to an index
 *
 * @param indexName index to use
 * @param doc Document to be indexed.
 * @param lang language (selects the analyzer for doc indexes).
 * @throws IndexingException something went wrong adding the document
 */
public void addToIndex(String indexName, Document doc, String lang)
    throws IndexingException {
    try {
        IndexWriter writer = getIndexWriter(indexName, lang);
        try {
            writer.addDocument(doc);
            // Flush before close so the add is durable even if close fails.
            writer.flush();
        }
        finally {
            try {
                writer.close();
            }
            finally {
                // unlock it if it is locked.
                unlockIndex(indexName);
            }
        }
    }
    catch (CorruptIndexException e) {
        throw new IndexingException(e);
    }
    catch (LockObtainFailedException e) {
        throw new IndexingException(e);
    }
    catch (IOException e) {
        throw new IndexingException(e);
    }
}
/**
 * Adds a document to an index, first removing any existing documents whose
 * uniqueField value matches, so the index never holds duplicates for that key.
 *
 * @param indexName index to add to
 * @param doc document with data to index
 * @param uniqueField field in doc which identifies this uniquely
 * @param lang language
 * @throws IndexingException if removing stale documents or adding fails
 */
public void addUniqueToIndex(String indexName, Document doc,
        String uniqueField, String lang)
    throws IndexingException {
    IndexReader reader = null;
    int numFound = 0;
    try {
        reader = getIndexReader(indexName, lang);
        Term term = new Term(uniqueField, doc.get(uniqueField));
        numFound = reader.docFreq(term);
    }
    catch (FileNotFoundException e) {
        // Index doesn't exist, so this add will be unique
        // we don't need to do anything
    }
    catch (IOException e) {
        throw new IndexingException(e);
    }
    finally {
        if (reader != null) {
            try {
                reader.close();
            }
            catch (IOException e) {
                // BUG FIX: previously swallowed silently; log it so close
                // failures are at least visible (the add still proceeds).
                log.warn("Failed to close IndexReader for " + indexName, e);
            }
        }
    }
    if (numFound > 0) {
        log.info("Found " + numFound + " <" + indexName + " docs for " +
                uniqueField + ":" + doc.get(uniqueField) +
                " will remove them now.");
        removeFromIndex(indexName, uniqueField, doc.get(uniqueField));
    }
    addToIndex(indexName, doc, lang);
}
/**
 * Remove all documents matching uniqueField:objectId from an index. Uses the
 * default language when resolving the index location.
 *
 * @param indexName index to use
 * @param uniqueField field name which represents this data's unique id
 * @param objectId unique document id
 * @throws IndexingException something went wrong removing the document
 */
public void removeFromIndex(String indexName, String uniqueField, String objectId)
    throws IndexingException {
    log.info("Removing <" + indexName + "> " + uniqueField + ":" +
            objectId);
    Term searchTerm = new Term(uniqueField, objectId);
    try {
        IndexReader reader = getIndexReader(indexName, IndexHandler.DEFAULT_LANG);
        try {
            reader.deleteDocuments(searchTerm);
            reader.flush();
        }
        finally {
            reader.close();
        }
    }
    catch (CorruptIndexException e) {
        throw new IndexingException(e);
    }
    catch (IOException e) {
        throw new IndexingException(e);
    }
}
/**
 * Unlocks the index at the given directory if it is currently locked;
 * otherwise does nothing.
 *
 * @param indexName index name
 * @throws IOException thrown if there is a problem unlocking the index.
 */
private void unlockIndex(String indexName) throws IOException {
    File indexDirFile = new File(indexWorkDir + indexName);
    Directory indexDir = FSDirectory.getDirectory(indexDirFile);
    if (IndexReader.isLocked(indexDir)) {
        IndexReader.unlock(indexDir);
    }
}
/**
 * Open an IndexWriter for the named index, creating the index directory if
 * needed. The writer uses the per-index analyzer and compound-file format.
 */
private IndexWriter getIndexWriter(String name, String lang)
    throws CorruptIndexException, LockObtainFailedException,
    IOException {
    String indexPath = indexWorkDir + name;
    // Ensure the directory exists before Lucene tries to write into it.
    new File(indexPath).mkdirs();
    IndexWriter writer = new IndexWriter(indexPath, getAnalyzer(name, lang));
    writer.setUseCompoundFile(true);
    return writer;
}
/**
 * Open an IndexReader for the named index. Documentation indexes live in a
 * per-locale subdirectory; every other index sits directly under the work dir.
 */
private IndexReader getIndexReader(String indexName, String locale)
    throws CorruptIndexException, IOException {
    String path = indexName.equals(BuilderFactory.DOCS_TYPE)
            ? indexWorkDir + File.separator + getDocIndexPath(locale)
            : indexWorkDir + indexName;
    log.info("IndexManager::getIndexReader(" + indexName + ", " + locale +
            ") path = " + path);
    return IndexReader.open(FSDirectory.getDirectory(new File(path)));
}
/**
 * Open an IndexSearcher for the named index, resolving the path the same way
 * as getIndexReader() (doc indexes are locale-specific).
 */
private IndexSearcher getIndexSearcher(String indexName, String locale)
    throws CorruptIndexException, IOException {
    String path = indexName.equals(BuilderFactory.DOCS_TYPE)
            ? indexWorkDir + File.separator + getDocIndexPath(locale)
            : indexWorkDir + indexName;
    log.info("IndexManager::getIndexSearcher(" + indexName + ", " + locale +
            ") path = " + path);
    return new IndexSearcher(path);
}
/**
 * Build a query parser matched to the index type: plain Lucene parsing over
 * "content" for doc searches, NGram-aware parsing over "name" otherwise.
 * Date fields are parsed at minute resolution.
 */
private QueryParser getQueryParser(String indexName, String lang,
        boolean isFineGrained) {
    if (log.isDebugEnabled()) {
        log.debug("getQueryParser(" + indexName + ", " + lang + ", " +
                isFineGrained + ")");
    }
    Analyzer analyzer = getAnalyzer(indexName, lang);
    QueryParser parser;
    if (indexName.equals(BuilderFactory.DOCS_TYPE)) {
        parser = new QueryParser("content", analyzer);
    }
    else {
        parser = new NGramQueryParser("name", analyzer, isFineGrained);
    }
    parser.setDateResolution(DateTools.Resolution.MINUTE);
    return parser;
}
/**
 * Select the analyzer for the given index type; index writing and query
 * parsing both call this so the same analysis is applied on both sides.
 */
private Analyzer getAnalyzer(String indexName, String lang) {
    if (log.isDebugEnabled()) {
        log.debug("getAnalyzer(" + indexName + ", " + lang + ")");
    }
    if (indexName.equals(BuilderFactory.DOCS_TYPE)) {
        return getDocAnalyzer(lang);
    }
    if (indexName.equals(BuilderFactory.SERVER_TYPE)) {
        return getServerAnalyzer();
    }
    if (indexName.equals(BuilderFactory.ERRATA_TYPE)) {
        return getErrataAnalyzer();
    }
    if (indexName.equals(BuilderFactory.SNAPSHOT_TAG_TYPE)) {
        return getSnapshotTagAnalyzer();
    }
    if (indexName.equals(BuilderFactory.HARDWARE_DEVICE_TYPE)) {
        return getHardwareDeviceAnalyzer();
    }
    if (indexName.equals(BuilderFactory.SERVER_CUSTOM_INFO_TYPE)) {
        return getServerCustomInfoAnalyzer();
    }
    log.debug(indexName + " using getDefaultAnalyzer()");
    return getDefaultAnalyzer();
}
/**
 * Convert Lucene hits into Result objects, stopping as soon as a hit's score
 * falls below the relevant threshold (hits are assumed to arrive best-first)
 * or the configured maximum number of hits has been collected.
 *
 * @param indexName index the hits came from (selects the Result subclass)
 * @param hits Lucene hits to convert
 * @param queryTerms terms extracted from the rewritten query, used to guess
 * the matching field
 * @param query original query string
 * @param lang language, used for doc summaries
 * @return results in hit order, at most maxHits entries when maxHits > 0
 * @throws IOException if hit data cannot be read from the index
 */
private List<Result> processHits(String indexName, Hits hits, Set<Term> queryTerms,
        String query, String lang)
    throws IOException {
    List<Result> retval = new ArrayList<Result>();
    for (int x = 0; x < hits.length(); x++) {
        Document doc = hits.doc(x);
        if (!isScoreAcceptable(indexName, hits, x, query)) {
            break;
        }
        Result pr;
        if (indexName.equals(BuilderFactory.DOCS_TYPE)) {
            DocResult dr = new DocResult(x, hits.score(x), doc);
            String summary = lookupDocSummary(doc, query, lang);
            if (summary != null) {
                dr.setSummary(summary);
            }
            pr = dr;
        }
        else if (indexName.equals(BuilderFactory.HARDWARE_DEVICE_TYPE)) {
            pr = new HardwareDeviceResult(x, hits.score(x), doc);
        }
        else if (indexName.equals(BuilderFactory.SNAPSHOT_TAG_TYPE)) {
            pr = new SnapshotTagResult(x, hits.score(x), doc);
        }
        else if (indexName.equals(BuilderFactory.SERVER_CUSTOM_INFO_TYPE)) {
            pr = new ServerCustomInfoResult(x, hits.score(x), doc);
        }
        else if (indexName.equals(BuilderFactory.XCCDF_IDENT_TYPE)) {
            pr = new Result(x,
                    doc.getField("id").stringValue(),
                    doc.getField("identifier").stringValue(),
                    hits.score(x));
        }
        else {
            pr = new Result(x,
                    doc.getField("id").stringValue(),
                    doc.getField("name").stringValue(),
                    hits.score(x));
        }
        if (log.isDebugEnabled()) {
            log.debug("Hit[" + x + "] Score = " + hits.score(x) + ", Result = " + pr);
        }
        /**
         * matchingField will help the webUI to understand what field was responsible
         * for this match. Later implementation should use "Explanation" to determine
         * field, for now we will simply grab one term and return it's field.
         */
        try {
            MatchingField match = new MatchingField(query, doc, queryTerms);
            pr.setMatchingField(match.getFieldName());
            pr.setMatchingFieldValue(match.getFieldValue());
            log.info("hit[" + x + "] matchingField is being set to: <" +
                    pr.getMatchingField() + "> based on passed in query field. " +
                    "matchingFieldValue = " + pr.getMatchingFieldValue());
        }
        catch (Exception e) {
            log.error("Caught exception: ", e);
        }
        retval.add(pr);
        // BUG FIX: the old check (x == maxHits) broke only AFTER adding the
        // result at index maxHits, returning maxHits + 1 results. Cap on the
        // collected count instead.
        if (maxHits > 0 && retval.size() >= maxHits) {
            break;
        }
    }
    return retval;
}
/**
 * Decide whether the hit at position x scores well enough to keep. The
 * callers and the log messages below assume hits arrive in descending score
 * order, so a false return is used to stop processing entirely.
 *
 * @param indexName index the hits came from (selects which threshold applies)
 * @param hits Lucene hits being filtered
 * @param x position of the current hit within hits
 * @param queryIn raw query string; its first field name picks the errata
 * threshold (advisory-name searches use a different cut-off)
 * @return true - score is acceptable
 * false - score is NOT acceptable
 * @throws IOException if hit data cannot be read from the index
 */
private boolean isScoreAcceptable(String indexName, Hits hits, int x, String queryIn)
    throws IOException {
    String guessMainQueryTerm = MatchingField.getFirstFieldName(queryIn);
    // Doc searches are unfiltered unless search.doc.limit_results is set.
    if ((indexName.compareTo(BuilderFactory.DOCS_TYPE) == 0) &&
            (!filterDocResults)) {
        return true;
    }
    /**
     * Dropping matches which are a poor fit.
     * system searches are filtered based on "system_score_threshold"
     * other searches will return 10 best matches, then filter anything below
     * "score_threshold"
     */
    if ((indexName.compareTo(BuilderFactory.SERVER_TYPE) == 0) ||
            (indexName.compareTo(BuilderFactory.SERVER_CUSTOM_INFO_TYPE) == 0) ||
            (indexName.compareTo(BuilderFactory.SNAPSHOT_TAG_TYPE) == 0) ||
            (indexName.compareTo(BuilderFactory.HARDWARE_DEVICE_TYPE) == 0)) {
        if (hits.score(x) < system_score_threshold) {
            if (log.isDebugEnabled()) {
                log.debug("hits.score(" + x + ") is " + hits.score(x));
                log.debug("Filtering out search results from " + x + " to " +
                        hits.length() + ", due to their score being below " +
                        "system_score_threshold = " + system_score_threshold);
            }
            return false;
        }
    }
    else if (indexName.compareTo(BuilderFactory.ERRATA_TYPE) == 0) {
        // Advisory-name errata queries use a stricter, dedicated threshold.
        if (guessMainQueryTerm.compareTo("name") == 0) {
            if (hits.score(x) < errata_advisory_score_threshold) {
                if (log.isDebugEnabled()) {
                    log.debug("hits.score(" + x + ") is " + hits.score(x));
                    log.debug("Filtering out search results from " + x + " to " +
                            hits.length() + ", due to their score being below " +
                            "errata_advisory_score_threshold = " +
                            errata_advisory_score_threshold);
                }
                return false;
            }
        }
        else {
            if (hits.score(x) < errata_score_threshold) {
                if (log.isDebugEnabled()) {
                    log.debug("hits.score(" + x + ") is " + hits.score(x));
                    log.debug("Filtering out search results from " + x + " to " +
                            hits.length() + ", due to their score being below " +
                            "errata_score_threshold = " +
                            errata_score_threshold);
                }
                return false;
            }
        }
    }
    else if (((hits.score(x) < score_threshold) && (x > 10)) ||
            (hits.score(x) < 0.001)) {
        /**
         * Dropping matches which are a poor fit.
         * First term is configurable, it allows matches like spelling errors or
         * suggestions to be possible.
         * Second term is intended to get rid of pure and utter crap hits
         */
        if (log.isDebugEnabled()) {
            log.debug("hits.score(" + x + ") is " + hits.score(x));
            log.debug("Filtering out search results from " + x + " to " +
                    hits.length() + ", due to their score being below " +
                    "score_threshold = " + score_threshold);
        }
        return false;
    }
    return true;
}
/**
 * Removes any documents which are not related to the passed in Set of good values.
 *
 * @param ids Set of ids of all known/good values
 * @param indexName index name to operate on
 * @param uniqField the name of the field in the Document to uniquely identify
 * this record
 * @return the number of documents deleted (0 on error; errors are logged,
 * not rethrown, matching the original best-effort contract)
 */
public int deleteRecordsNotInList(Set<String> ids, String indexName,
        String uniqField) {
    int count = 0;
    IndexReader reader = null;
    try {
        reader = getIndexReader(indexName, IndexHandler.DEFAULT_LANG);
        // Use maxDoc() to iterate over all docs, numDocs() returns the
        // number of currently alive docs leaving out the deleted ones.
        int maxDoc = reader.maxDoc();
        for (int i = 0; i < maxDoc; i++) {
            if (reader.isDeleted(i)) {
                continue;
            }
            Document doc = reader.document(i);
            // BUG FIX: doc.get() returns null for a missing field instead of
            // NPE'ing as getField().stringValue() did; skip malformed docs.
            String uniqId = doc.get(uniqField);
            if (uniqId == null) {
                log.warn(indexName + ": document " + i + " has no field <" +
                        uniqField + ">, skipping.");
                continue;
            }
            if (!ids.contains(uniqId)) {
                log.info(indexName + ":" + uniqField + ": <" + uniqId +
                        "> not found in list of current/good values " +
                        "assuming this has been deleted from Database and we " +
                        "should remove it.");
                removeFromIndex(indexName, uniqField, uniqId);
                count++;
            }
        }
    }
    catch (IOException e) {
        // Was printStackTrace() + log.info; log at error with the throwable.
        log.error("deleteRecordsNotInList() caught exception : " + e, e);
    }
    catch (IndexingException e) {
        log.error("deleteRecordsNotInList() caught exception : " + e, e);
    }
    finally {
        if (reader != null) {
            try {
                reader.close();
            }
            catch (IOException e) {
                log.warn("Failed to close IndexReader for " + indexName, e);
            }
        }
    }
    return count;
}
/**
 * Log Lucene Explanation data and the guessed matching field for the top
 * hits of a query; debug aid enabled via search.log.explain.results.
 *
 * @param indexName index the hits came from
 * @param hits hits to explain
 * @param searcher searcher that produced the hits
 * @param q parsed query
 * @param queryTerms terms extracted from the rewritten query
 * @throws IOException if hit data cannot be read from the index
 */
private void debugExplainResults(String indexName, Hits hits, IndexSearcher searcher,
        Query q, Set<Term> queryTerms)
    throws IOException {
    log.debug("Parsed Query is " + q.toString());
    log.debug("Looking at index: " + indexName);
    // PERF FIX: the old loop walked every hit and tested (i < 10) inside the
    // body; bound the loop instead — identical output, no wasted iterations.
    int limit = Math.min(hits.length(), 10);
    for (int i = 0; i < limit; i++) {
        Document doc = hits.doc(i);
        Float score = hits.score(i);
        Explanation ex = searcher.explain(q, hits.id(i));
        log.debug("Looking at hit<" + i + ", " + hits.id(i) + ", " + score +
                ">: " + doc);
        log.debug("Explanation: " + ex);
        MatchingField match = new MatchingField(q.toString(), doc, queryTerms);
        String fieldName = match.getFieldName();
        String fieldValue = match.getFieldValue();
        log.debug("Guessing that matched fieldName is " + fieldName + " = " +
                fieldValue);
    }
}
/**
 * Resolve the docs index subdirectory ("docs/<locale-dir>") for a language.
 *
 * @throws IOException if no locale directory is mapped for the language
 */
private String getDocIndexPath(String lang) throws IOException {
    String localeDir = lookupLocale(lang);
    if (StringUtils.isBlank(localeDir)) {
        log.error("Unable to find docs index dir for language " + lang);
        throw new IOException("Unable to find docs index dir for language: " + lang);
    }
    return BuilderFactory.DOCS_TYPE + File.separator + localeDir;
}
/**
 * Map a language code (e.g. "en", "pt_br") to its docs directory name
 * (e.g. "en-US"), falling back to the Locale's bare language code.
 * May return null/blank if the language is unknown.
 */
private String lookupLocale(String lang) {
    // BUG FIX: bare toLowerCase() uses the default locale, which mangles
    // codes like "SI" under a Turkish default locale (dotless i). Use an
    // explicit, locale-independent lowering.
    String ret = docLocaleLookUp.get(lang.toLowerCase(Locale.ENGLISH));
    if (StringUtils.isBlank(ret)) {
        Locale l = new Locale(lang);
        ret = docLocaleLookUp.get(l.getLanguage().toLowerCase(Locale.ENGLISH));
    }
    return ret;
}
/**
 * We want to use the same Analyzer nutch is using when the indexes are
 * generated; fall back to StandardAnalyzer when nutch is unavailable.
 */
private Analyzer getDocAnalyzer(String lang) {
    Analyzer analyzer;
    try {
        analyzer = nutchAnalyzerFactory.get(lang);
    }
    catch (Exception e) {
        log.info("Caught exception, nutch is most likely not installed");
        log.info("Defaulting to generic analyzer for Documentation Search");
        log.info("Install nutch package to get summary info and better matches.");
        analyzer = new StandardAnalyzer();
    }
    log.info("Language choice is " + lang + ", analyzer chosen is " +
            analyzer);
    return analyzer;
}
/**
 * Analyzer for the server index: ngram matching by default, exact keyword
 * matching for date/numeric fields.
 */
private Analyzer getServerAnalyzer() {
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(
            new NGramAnalyzer(min_ngram, max_ngram));
    String[] keywordFields = {"checkin", "registered", "ram", "swap",
            "cpuMHz", "cpuNumberOfCpus"};
    for (String field : keywordFields) {
        wrapper.addAnalyzer(field, new KeywordAnalyzer());
    }
    return wrapper;
}
/**
 * Analyzer for the errata index: ngram by default, keyword matching for
 * advisory names, standard tokenization for free-text fields.
 */
private Analyzer getErrataAnalyzer() {
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(
            new NGramAnalyzer(min_ngram, max_ngram));
    wrapper.addAnalyzer("advisoryName", new KeywordAnalyzer());
    String[] textFields = {"synopsis", "description", "topic", "solution"};
    for (String field : textFields) {
        wrapper.addAnalyzer(field, new StandardAnalyzer());
    }
    return wrapper;
}
/**
 * Analyzer for the snapshot-tag index: ngram by default, keyword matching
 * for id and date fields.
 */
private Analyzer getSnapshotTagAnalyzer() {
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(
            new NGramAnalyzer(min_ngram, max_ngram));
    String[] keywordFields = {"id", "snapshotId", "orgId", "serverId",
            "tagNameId", "created", "modified"};
    for (String field : keywordFields) {
        wrapper.addAnalyzer(field, new KeywordAnalyzer());
    }
    return wrapper;
}
/**
 * Analyzer for the hardware-device index: ngram by default, keyword matching
 * for id fields.
 */
private Analyzer getHardwareDeviceAnalyzer() {
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(
            new NGramAnalyzer(min_ngram, max_ngram));
    for (String field : new String[] {"id", "serverId", "pciType"}) {
        wrapper.addAnalyzer(field, new KeywordAnalyzer());
    }
    return wrapper;
}
/**
 * Analyzer for the server-custom-info index: ngram by default, keyword
 * matching for id, date, and user fields.
 */
private Analyzer getServerCustomInfoAnalyzer() {
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(
            new NGramAnalyzer(min_ngram, max_ngram));
    String[] keywordFields = {"id", "serverId", "created", "modified",
            "createdBy", "lastModifiedBy"};
    for (String field : keywordFields) {
        wrapper.addAnalyzer(field, new KeywordAnalyzer());
    }
    return wrapper;
}
/**
 * Default analyzer (used by package and other unlisted indexes): ngram by
 * default, keyword matching for exact-value fields.
 */
private Analyzer getDefaultAnalyzer() {
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(
            new NGramAnalyzer(min_ngram, max_ngram));
    String[] keywordFields = {"id", "arch", "epoch", "version",
            "release", "filename"};
    for (String field : keywordFields) {
        wrapper.addAnalyzer(field, new KeywordAnalyzer());
    }
    return wrapper;
}
/**
 * Initialize nutch configuration, analyzer factory, and per-language doc
 * segments used for search-result summaries. Failures are logged and
 * tolerated: doc summaries are then simply empty.
 *
 * NOTE: NutchConfiguration is expecting "nutch-default.xml" and "nutch-site.xml"
 * to be available in the CLASSPATH
 *
 * @return always true (kept for interface compatibility)
 */
private boolean initDocSummary() {
    try {
        nutchConf = NutchConfiguration.create();
        nutchAnalyzerFactory = new AnalyzerFactory(nutchConf);
        FileSystem fs = FileSystem.get(nutchConf);
        docSegments = new TreeMap<String, FetchedSegments>
                (String.CASE_INSENSITIVE_ORDER);
        for (String key : docLocaleLookUp.keySet()) {
            String segmentsDir = indexWorkDir + File.separator +
                    getDocIndexPath(key) + File.separator + "segments";
            FetchedSegments segments = new FetchedSegments(fs, segmentsDir, nutchConf);
            // BUG FIX: the old "segments == null" check was dead code (a
            // constructor never yields null) and the empty-segments branch
            // lacked a continue, so its null marker was immediately
            // overwritten by the unconditional put below.
            String[] segNames = segments.getSegmentNames();
            if (segNames == null || segNames.length == 0) {
                log.info("Unable to find any segments for language: " + key);
                docSegments.put(key, null);
                continue;
            }
            log.info("Adding Documentation segments for language: " + key);
            docSegments.put(key, segments);
        }
    }
    catch (Exception e) {
        // Was printStackTrace(); attach the throwable to the log entry.
        log.error("ignoring exception - most likely Nutch isn't present, so" +
                " doc summaries will be empty", e);
    }
    return true;
}
/**
 * Build a nutch summary snippet for a documentation hit, using the
 * per-language segments captured in initDocSummary().
 *
 * @param doc matched document; must carry "segment" and "url" fields
 * @param queryString original query string (re-parsed with nutch's parser)
 * @param lang language of the doc index searched
 * @return the summary text, or "" when segments/nutch are unavailable or
 * lookup fails (never null)
 */
private String lookupDocSummary(Document doc, String queryString, String lang) {
    if (docSegments == null) {
        log.info("docSegments is null, doc summary not possible");
        log.info("nutch is probably not installed, install nutch to get summary info");
        return "";
    }
    if (!docSegments.containsKey(lang)) {
        log.info("Couldn't find segments info for " + lang);
        log.info("Summary info will be missing for " + lang);
        return "";
    }
    FetchedSegments segments = docSegments.get(lang);
    if (segments == null) {
        log.info("Segments info for " + lang + " is null");
        return "";
    }
    try {
        if (log.isDebugEnabled()) {
            log.debug("Attempting lookupDocSummary<" + lang + "> for " + doc);
        }
        HitDetails hd = new HitDetails(doc.getField("segment").stringValue(),
                doc.getField("url").stringValue());
        // NOTE: Name conflict with Nutch's Query versus Lucene Query
        org.apache.nutch.searcher.Query query =
                org.apache.nutch.searcher.Query.parse(queryString, nutchConf);
        Summary sum = segments.getSummary(hd, query);
        if (log.isDebugEnabled()) {
            log.debug("Will return summary<" + lang + "> = " + sum.toString());
        }
        return sum.toString();
    }
    catch (Exception e) {
        // Was printStackTrace(); attach the throwable to the log entry.
        log.info("Failed to lookupDocSummary<" + lang + ">, caught Exception: " + e, e);
    }
    return "";
}
/**
 * Populate the language-code to docs-directory-name table. Entries after the
 * note exist in docs but weren't available as a doc option from the satellite
 * webui, nor as Locales on the author's machine — guesses at what they will
 * look like.
 */
private void initDocLocaleLookup() {
    String[][] mappings = {
        {"bn", "bn-IN"}, {"bn_in", "bn-IN"},
        {"de", "de-DE"},
        {"en_us", "en-US"}, {"en", "en-US"},
        {"es", "es-ES"},
        {"fr", "fr-FR"},
        {"gu", "gu-IN"},
        {"hi", "hi-IN"},
        {"it", "it-IT"},
        {"ja", "ja-JP"},
        {"ko", "ko-KR"},
        {"pa", "pa-IN"},
        {"pt_br", "pt-BR"}, {"pt", "pt-BR"}, {"pt_pt", "pt-BR"},
        {"ru", "ru-RU"},
        {"ta", "ta-IN"},
        {"zh", "zh-CN"}, {"zh_cn", "zh-CN"}, {"zh_tw", "zh-TW"},
        // guessed entries (see javadoc above)
        {"as", "as-IN"},
        {"ml", "ml-IN"},
        {"mr", "mr-IN"},
        {"or", "or-IN"},
        {"kn", "kn-IN"},
        {"si_lk", "si-LK"},
        {"te", "te-IN"},
    };
    for (String[] mapping : mappings) {
        docLocaleLookUp.put(mapping[0], mapping[1]);
    }
}
}