package de.juwimm.cms.search.beans;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Vector;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.util.Version;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.transaction.annotation.Isolation;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;
import org.tizzit.util.XercesHelper;
import de.juwimm.cms.beans.foreign.TizzitPropertiesBeanSpring;
import de.juwimm.cms.common.Constants;
import de.juwimm.cms.model.ContentHbm;
import de.juwimm.cms.model.ContentHbmDao;
import de.juwimm.cms.model.ContentVersionHbm;
import de.juwimm.cms.model.DocumentHbm;
import de.juwimm.cms.model.DocumentHbmDao;
import de.juwimm.cms.model.HostHbm;
import de.juwimm.cms.model.HostHbmDao;
import de.juwimm.cms.model.SiteHbm;
import de.juwimm.cms.model.SiteHbmDao;
import de.juwimm.cms.model.UnitHbm;
import de.juwimm.cms.model.UnitHbmDao;
import de.juwimm.cms.model.ViewComponentHbm;
import de.juwimm.cms.model.ViewComponentHbmDao;
import de.juwimm.cms.model.ViewDocumentHbm;
import de.juwimm.cms.model.ViewDocumentHbmDao;
import de.juwimm.cms.safeguard.remote.SafeguardServiceSpring;
import de.juwimm.cms.search.lucene.IndexingMode;
import de.juwimm.cms.search.lucene.LuceneService;
import de.juwimm.cms.search.res.DocumentResourceLocatorFactory;
import de.juwimm.cms.search.res.HtmlDocumentLocator;
import de.juwimm.cms.search.vo.LinkDataValue;
import de.juwimm.cms.search.vo.SearchResultValue;
import de.juwimm.cms.search.vo.XmlSearchValue;
import de.juwimm.cms.search.xmldb.XmlDb;
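/**
* Search service of the CMS: flags content and documents for (re-)indexing, writes pages and
* documents to the Lucene and XML indexes, and answers full-text and XPath search queries.
*
* @author <a href="mailto:j2ee@juwimm.com">Sascha-Matthias Kulawik</a>
* company Juwi|MacMillan Group GmbH, Walsrode, Germany
* @version $Id$
* @since cqcms-core 03.07.2009
*/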
@Transactional(isolation = Isolation.READ_COMMITTED, propagation = Propagation.REQUIRED)
public class SearchengineService {
/** Class-wide log4j logger. */
private static Logger log = Logger.getLogger(SearchengineService.class);
/** Regex character class of the characters escaped in search input: backslash, !, (, ), :, ^, ], {, }, ~ */
private static final String LUCENE_ESCAPE_CHARS = "[\\\\!\\(\\)\\:\\^\\]\\{\\}\\~]";
/** Compiled form of {@link #LUCENE_ESCAPE_CHARS}. */
private static final Pattern LUCENE_PATTERN = Pattern.compile(LUCENE_ESCAPE_CHARS);
/** Regex replacement that puts a backslash in front of the matched character. */
private static final String REPLACEMENT_STRING = "\\\\$0";
/** Number of rows of the matrix returned by searchWebSuggestions. */
private static final int NUMBER_OF_SUGGESTIONS=5;
// @Autowired
// private Compass compass;
/** Access to the Lucene index (search, add, remove, spell checker). */
@Autowired
private LuceneService luceneService;
/** XML database used for the XPath based searches and the XML index. */
@Autowired
private XmlDb xmlDb;
// @Autowired
// private HtmlResourceLocator htmlResourceLocator;
/** Builds the Lucene document for a downloaded HTML page. */
@Autowired
private HtmlDocumentLocator htmlDocumentLocator;
/** Builds the Lucene document for a stored document and knows the supported mime types. */
@Autowired
private DocumentResourceLocatorFactory documentResourceLocatorFactory;
/** Removes pages from the Lucene / XML index and resolves page URLs. */
@Autowired
private SearchengineDeleteService searchengineDeleteService;
@Autowired
private SafeguardServiceSpring safeguardServiceSpring;
/** Configuration bean; read here for the "use Oracle batch update" search setting. */
@Autowired
private TizzitPropertiesBeanSpring tizzitPropertiesBeanSpring;
// The DAOs below carry no @Autowired annotation; they are assigned through their setters.
private ViewComponentHbmDao viewComponentHbmDao;
private SiteHbmDao siteHbmDao;
private HostHbmDao hostHbmDao;
private DocumentHbmDao documentHbmDao;
private ContentHbmDao contentHbmDao;
private ViewDocumentHbmDao viewDocumentHbmDao;
private UnitHbmDao unitHbmDao;
/** Shared HTTP client used by downloadFile / downloadToFile to fetch rendered pages. */
private final HttpClient client = new HttpClient();
/** @return the safeguard service currently set on this service */
public SafeguardServiceSpring getSafeguardServiceSpring() {
return safeguardServiceSpring;
}
/** @param safeguardServiceSpring the safeguard service to use */
public void setSafeguardServiceSpring(SafeguardServiceSpring safeguardServiceSpring) {
this.safeguardServiceSpring = safeguardServiceSpring;
}
/** @param viewComponentHbmDao the DAO used to load and query view components */
public void setViewComponentHbmDao(ViewComponentHbmDao viewComponentHbmDao) {
this.viewComponentHbmDao = viewComponentHbmDao;
}
/** @param siteHbmDao the DAO used to load sites */
public void setSiteHbmDao(SiteHbmDao siteHbmDao) {
this.siteHbmDao = siteHbmDao;
}
/** @param hostHbmDao the DAO used to look up the hosts of a site */
public void setHostHbmDao(HostHbmDao hostHbmDao) {
this.hostHbmDao = hostHbmDao;
}
/** @param documentHbmDao the DAO used to load documents */
public void setDocumentHbmDao(DocumentHbmDao documentHbmDao) {
this.documentHbmDao = documentHbmDao;
}
/** @param contentHbmDao the DAO used to load contents */
public void setContentHbmDao(ContentHbmDao contentHbmDao) {
this.contentHbmDao = contentHbmDao;
}
/** @param viewDocumentHbmDao the DAO used to find the view documents of a site */
public void setViewDocumentHbmDao(ViewDocumentHbmDao viewDocumentHbmDao) {
this.viewDocumentHbmDao = viewDocumentHbmDao;
}
/** @param unitHbmDao the DAO used to load units */
public void setUnitHbmDao(UnitHbmDao unitHbmDao) {
this.unitHbmDao = unitHbmDao;
}
/** @return the view component DAO */
public ViewComponentHbmDao getViewComponentHbmDao() {
return viewComponentHbmDao;
}
/** @return the site DAO */
public SiteHbmDao getSiteHbmDao() {
return siteHbmDao;
}
/** @return the host DAO */
public HostHbmDao getHostHbmDao() {
return hostHbmDao;
}
/** @return the document DAO */
public DocumentHbmDao getDocumentHbmDao() {
return documentHbmDao;
}
/** @return the content DAO */
public ContentHbmDao getContentHbmDao() {
return contentHbmDao;
}
/** @return the view document DAO */
public ViewDocumentHbmDao getViewDocumentHbmDao() {
return viewDocumentHbmDao;
}
/** @return the unit DAO */
public UnitHbmDao getUnitHbmDao() {
return unitHbmDao;
}
/**
 * Runs an XPath query against the XML database of one site.
 *
 * @param siteId id of the site whose XML data is queried
 * @param xpathQuery XPath expression handed to the XML database unchanged
 * @return the result of {@code xmlDb.searchXml} for the given site and query
 * @throws Exception if the XML database lookup fails
 * @see de.juwimm.cms.search.remote.SearchengineServiceSpring#searchXML(java.lang.Integer, java.lang.String)
 */
@Transactional(readOnly = true)
public XmlSearchValue[] searchXML(Integer siteId, String xpathQuery) throws Exception {
    return xmlDb.searchXml(siteId, xpathQuery);
}
/**
 * Triggers a re-index of every site known to the site DAO by calling
 * {@link #reindexSite(Integer)} for each of them.
 * <p>
 * Failures are logged (including the stack trace) and not rethrown, so a caller never sees an
 * exception from this method; the first failure ends the loop over the sites.
 *
 * @throws Exception declared for interface compatibility only
 * @see de.juwimm.cms.search.remote.SearchengineServiceSpring#startIndexer()
 */
public void startIndexer() throws Exception {
    try {
        long startMillis = System.currentTimeMillis();
        for (Object element : getSiteHbmDao().findAll()) {
            SiteHbm site = (SiteHbm) element;
            if (log.isDebugEnabled()) log.debug("Starting with Site " + site.getName() + " (" + site.getSiteId() + ")");
            reindexSite(site.getSiteId());
        }
        if (log.isInfoEnabled()) log.info((System.currentTimeMillis() - startMillis) + " total milliseconds");
    } catch (Exception e) {
        // Pass the exception itself so the stack trace ends up in the log.
        log.error("Caught a " + e.getClass() + "\n with message: " + e.getMessage(), e);
    }
}
/**
 * Schedules a complete re-index of one site.
 * <p>
 * For sites with external site search only the site's "update site index" flag is set.
 * Otherwise the Lucene service is asked to remove everything matching the {@code siteId} term,
 * and all contents and documents of the site get their "update search index" flag set; the
 * actual indexing is not done in this method.
 * <p>
 * Errors while flagging are logged with their stack trace and do not abort the method.
 *
 * @param siteId id of the site to re-index
 * @throws Exception if loading the site or removing its Lucene documents fails
 * @see de.juwimm.cms.search.remote.SearchengineServiceSpring#reindexSite(java.lang.Integer)
 */
@SuppressWarnings("unchecked")
public void reindexSite(Integer siteId) throws Exception {
    if (log.isDebugEnabled()) log.debug("Starting reindexing site " + siteId);
    SiteHbm site = getSiteHbmDao().load(siteId);
    // if site with external site search then schedule it to the ExternalSitesCronService for indexing
    if (site.getExternalSiteSearch() != null && site.getExternalSiteSearch()) {
        site.setUpdateSiteIndex(true);
        return;
    }
    luceneService.removeDocument(new Term("siteId", siteId + ""));
    long startMillis = System.currentTimeMillis();

    // Step 1: flag the contents below the root view component of every view document.
    try {
        Collection vdocs = getViewDocumentHbmDao().findAll(siteId);
        for (Object element : vdocs) {
            ViewDocumentHbm vdl = (ViewDocumentHbm) element;
            if (log.isDebugEnabled()) log.debug("- Starting ViewDocument: " + vdl.getLanguage() + " " + vdl.getViewType());
            ViewComponentHbm rootvc = vdl.getViewComponent();
            if (tizzitPropertiesBeanSpring.getSearch().isUseOracleBatchUpdate()) {
                setUpdateSearchIndex4AllVCsUsingQuery(rootvc);
            } else {
                setUpdateSearchIndex4AllVCs(rootvc);
            }
        }
    } catch (Exception e) {
        log.error("Caught a " + e.getClass() + "\n with message: " + e.getMessage(), e);
    }

    // Step 2: flag the documents attached to the units and to the view components of each unit.
    try {
        Collection<UnitHbm> units = getUnitHbmDao().findBySite(siteId);
        for (UnitHbm unit : units) {
            Collection<DocumentHbm> docs = getDocumentHbmDao().findAllPerUnit(unit.getUnitId());
            for (DocumentHbm doc : docs) {
                // -- Indexing through No-Messaging
                doc.setUpdateSearchIndex(true);
            }
            Collection<ViewComponentHbm> allViewComponentsForUnit = new ArrayList<ViewComponentHbm>();
            Collection<ViewComponentHbm> rootViewComponents = getViewComponentHbmDao().find4Unit(unit.getUnitId());
            for (ViewComponentHbm viewComponentHbm : rootViewComponents) {
                allViewComponentsForUnit.addAll(viewComponentHbm.getAllChildrenOfUnit());
            }
            docs = new ArrayList<DocumentHbm>();
            for (ViewComponentHbm viewComponentHbm : allViewComponentsForUnit) {
                docs.addAll(getDocumentHbmDao().findAllPerViewComponent(viewComponentHbm.getViewComponentId()));
            }
            for (DocumentHbm doc : docs) {
                // -- Indexing through No-Messaging
                doc.setUpdateSearchIndex(true);
            }
        }
    } catch (Exception e) {
        log.error("Caught a " + e.getClass() + "\n with message: " + e.getMessage(), e);
    }

    if (log.isInfoEnabled()) log.info((System.currentTimeMillis() - startMillis) + " total milliseconds for site " + siteId);
    if (log.isDebugEnabled()) log.debug("finished index for site " + siteId);
}
/**
 * Walks the view component tree depth-first and sets the "update search index" flag on the
 * content referenced by every content or unit view component.
 * A view component whose reference is "DUMMY" is skipped together with its whole subtree.
 *
 * @param viewComponent root of the (sub)tree to process
 */
private void setUpdateSearchIndex4AllVCs(ViewComponentHbm viewComponent) {
    if ("DUMMY".equals(viewComponent.getReference())) {
        return;
    }
    boolean referencesContent = viewComponent.getViewType() == Constants.VIEW_TYPE_CONTENT
            || viewComponent.getViewType() == Constants.VIEW_TYPE_UNIT;
    if (referencesContent) {
        try {
            ContentHbm referenced = getContentHbmDao().load(Integer.valueOf(viewComponent.getReference()));
            // -- Indexing through No-Messaging
            referenced.setUpdateSearchIndex(true);
        } catch (Exception exe) {
            log.warn("Could not resolve Content with Id: " + viewComponent.getReference(), exe);
        }
    }
    if (viewComponent.isLeaf()) {
        return;
    }
    for (Object childObject : viewComponent.getChildren()) {
        setUpdateSearchIndex4AllVCs((ViewComponentHbm) childObject);
    }
}
/**
 * Batch variant of the flagging: delegates to the view component DAO's
 * {@code bulkUpdateForSearchengine} for the given view component instead of walking the tree
 * in Java.
 *
 * @param viewComponent view component whose id is handed to the bulk update
 */
private void setUpdateSearchIndex4AllVCsUsingQuery(ViewComponentHbm viewComponent) {
    String message = "Using oracle batch update for view component id " + viewComponent.getViewComponentId();
    log.debug(message);
    ViewComponentHbmDao dao = getViewComponentHbmDao();
    dao.bulkUpdateForSearchengine(viewComponent.getViewComponentId());
}
/**
 * Collects the places that link to a document: searches the site's XML data for
 * {@code document} elements whose {@code src} attribute equals the document id and turns each
 * hit into a {@link LinkDataValue}. Hits are processed from the last to the first.
 * <p>
 * A unit or view component that cannot be loaded is logged; the entry is still added with the
 * data gathered so far. Any other failure is logged and ends the collection early.
 *
 * @param siteId id of the site to search in
 * @param doc document whose references are looked up
 * @return the collected link data, never {@code null}
 */
private LinkDataValue[] getLinkData(Integer siteId, DocumentHbm doc) {
    Collection<LinkDataValue> links = new ArrayList<LinkDataValue>();
    try {
        String xpath = "//document[@src=\"" + doc.getDocumentId().toString() + "\"]";
        XmlSearchValue[] found = this.searchXML(siteId, xpath);
        int index = (found == null) ? -1 : found.length - 1;
        while (index >= 0) {
            XmlSearchValue hit = found[index];
            index--;
            LinkDataValue link = new LinkDataValue();
            link.setUnitId(hit.getUnitId());
            link.setViewComponentId(hit.getViewComponentId());
            try {
                UnitHbm unit = getUnitHbmDao().load(link.getUnitId());
                link.setUnitName(unit.getName());
            } catch (Exception e) {
                log.error("Error loading Unit " + link.getUnitId() + ": Perhaps SearchIndex is corrupt? " + e.getMessage(), e);
            }
            try {
                ViewComponentHbm vc = getViewComponentHbmDao().load(link.getViewComponentId());
                link.setViewComponentPath(vc.getPath());
                ViewDocumentHbm viewDocument = vc.getViewDocument();
                link.setLanguage(viewDocument.getLanguage());
                link.setViewType(viewDocument.getViewType());
            } catch (Exception e) {
                log.error("Error loading ViewComponent " + link.getViewComponentId() + ": Perhaps SearchIndex is corrupt? " + e.getMessage(), e);
            }
            links.add(link);
        }
    } catch (Exception e) {
        log.error("Error loading link-data for document " + doc.getDocumentId() + ": " + e.getMessage(), e);
    }
    return links.toArray(new LinkDataValue[0]);
}
/**
 * Runs an XPath query against the XML data of one unit.
 * <p>
 * With {@code parentSearch} set, the unit's view component in the given view document is looked
 * up and, unless it is the root, the unit of its parent view component is searched instead.
 * If that lookup fails the error is logged and the original unit is searched.
 *
 * @param unitId id of the unit to search (or whose parent unit is searched)
 * @param viewDocumentId id of the view document
 * @param xpathQuery XPath expression handed to the XML database
 * @param parentSearch whether to search the parent's unit instead
 * @return the result of {@code xmlDb.searchXmlByUnit}
 * @throws Exception if the XML database lookup fails
 */
@Transactional(readOnly = true)
public XmlSearchValue[] searchXmlByUnit(Integer unitId, Integer viewDocumentId, String xpathQuery, boolean parentSearch) throws Exception {
    Integer unitToSearch = unitId;
    if (parentSearch) {
        try {
            ViewComponentHbm unitRoot = getViewComponentHbmDao().find4Unit(unitId, viewDocumentId);
            boolean hasParent = unitRoot != null && !unitRoot.isRoot();
            if (hasParent) {
                unitToSearch = unitRoot.getParent().getUnit4ViewComponent();
            }
        } catch (Exception e) {
            log.error("Error finding VC by unit " + unitId + " and viewDocument " + viewDocumentId + " in searchXmlByUnit!", e);
        }
    }
    return xmlDb.searchXmlByUnit(unitToSearch, viewDocumentId, xpathQuery);
}
/**
 * Full-text search over the Lucene index.
 * <p>
 * Characters with a special meaning in Lucene (see {@code LUCENE_ESCAPE_CHARS}) are escaped in
 * {@code searchItem} and {@code searchUrl}; the wildcard and operator characters
 * {@code * ? + -} are deliberately left alone. Each hit is converted to a
 * {@link SearchResultValue} with a highlighted summary taken from the "contents" field.
 * <p>
 * Apart from {@link BooleanQuery.TooManyClauses}, which is logged and rethrown, every exception
 * is logged and swallowed; the results collected up to that point are returned.
 *
 * @param siteId site to search in (used when {@code unitId} is {@code null})
 * @param unitId unit to restrict the search to, or {@code null}
 * @param searchItem the user's search string
 * @param pageSize page size copied into every result; values &lt;= 1 become 20
 * @param pageNumber page number copied into every result; negative values become 0
 * @param safeGuardCookieMap map whose keys are added as allowed realms, may be {@code null}
 * @param searchUrl URL fragment the hits must match, or {@code null}
 * @param isLiveServer whether only documents marked as live content may match
 * @param fragmentSize size handed to the highlight fragmenter; values &lt;= 1 become 100
 * @return the hits, never {@code null}
 * @throws Exception {@link BooleanQuery.TooManyClauses} when the query expands to too many terms
 */
@Transactional(readOnly = true)
public SearchResultValue[] searchWeb(Integer siteId,Integer unitId, final String searchItem, Integer pageSize, Integer pageNumber, Map safeGuardCookieMap, String searchUrl, boolean isLiveServer, int fragmentSize) throws Exception {
    if (pageSize != null && pageSize.intValue() <= 1) pageSize = Integer.valueOf(20);
    if (fragmentSize <= 1) fragmentSize = 100;
    if (pageNumber != null && pageNumber.intValue() < 0) pageNumber = Integer.valueOf(0); // first page
    Collection<SearchResultValue> retArr = new ArrayList<SearchResultValue>();
    if (log.isDebugEnabled()) log.debug("starting compass-search");
    try {
        if (log.isDebugEnabled()) {
            log.debug("searchurl is: " + searchUrl);
            log.debug("search for: \"" + searchItem + "\"");
            log.debug("SonderZeicheVergleich (ae, oe, ue, ss): ä ö ü ß");
        }
        //TODO: find calls of searchWeb and ADD exception handling
        // Special chars with a meaning in Lucene have to be escaped. QueryParser.escape would do
        // that too, but it would also escape '*', '?', '+', '-' which must stay usable in searchItem.
        String searchItemEsc = LUCENE_PATTERN.matcher(searchItem).replaceAll(REPLACEMENT_STRING);
        String searchUrlEsc = null;
        if (searchUrl != null) {
            searchUrlEsc = LUCENE_PATTERN.matcher(searchUrl).replaceAll(REPLACEMENT_STRING);
        }
        if (log.isDebugEnabled() && !searchItem.equalsIgnoreCase(searchItemEsc)) {
            log.debug("search for(escaped form): \"" + searchItemEsc + "\"");
        }
        Query query = buildRatedWildcardQuery(siteId, unitId, searchItemEsc, searchUrlEsc, safeGuardCookieMap, isLiveServer);
        if (log.isDebugEnabled()) log.debug("search for query: " + query.toString());
        TopScoreDocCollector collector = luceneService.search(query);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        // Highlighter, fragmenter and analyzer do not depend on the hit, so build them once.
        Highlighter highlighter = new Highlighter(new QueryScorer(query, "contents"));
        highlighter.setTextFragmenter(new SimpleFragmenter(fragmentSize));
        Analyzer highlightAnalyzer = new StandardAnalyzer(Version.LUCENE_36);
        if (log.isDebugEnabled()) log.debug(hits.length + " results found");
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document resource = luceneService.getDocument(docId);
            SearchResultValue retVal = new SearchResultValue();
            retVal.setScore((int) (hits[i].score * 100.0f));
            String contents = resource.get("contents");
            TokenStream stream = TokenSources.getAnyTokenStream(luceneService.getIndexReader(), docId, "contents", resource, highlightAnalyzer);
            // getBestFragment may yield null; stripNonValidXMLCharacters maps that to "".
            String fragment = highlighter.getBestFragment(stream, contents);
            retVal.setSummary(stripNonValidXMLCharacters(fragment));
            retVal.setUnitId(Integer.valueOf(resource.get("unitId")));
            retVal.setUnitName(resource.get("unitName"));
            retVal.setPageSize(pageSize);
            retVal.setPageNumber(pageNumber);
            retVal.setDuration(0L);
            retVal.setPageAmount(null);
            retVal.setTotalHits(collector.getTotalHits());
            String alias = resource.get("alias");
            if ("HtmlSearchValue".equalsIgnoreCase(alias)) {
                String url = resource.get("url");
                if (url != null) {
                    retVal.setUrl(url);
                    retVal.setTitle(resource.get("title"));
                    retVal.setLanguage(resource.get("language"));
                    retVal.setViewType(resource.get("viewtype"));
                    retVal.setIsLiveContent(Boolean.parseBoolean(resource.get("isLiveContent")));
                    String template = resource.get("template");
                    retVal.setTemplate(template != null ? template : "standard");
                }
            } else {
                String documentId = resource.get("documentId");
                Integer intDocumentId = null;
                try {
                    intDocumentId = Integer.valueOf(documentId);
                } catch (Exception e) {
                    log.warn("Error converting documentId: " + e.getMessage());
                    if (log.isDebugEnabled()) log.debug(e);
                }
                if (intDocumentId != null) {
                    // Use the id stored in the "documentId" field, not Lucene's internal doc number.
                    retVal.setDocumentId(intDocumentId);
                    retVal.setDocumentName(resource.get("documentName"));
                    retVal.setMimeType(resource.get("mimeType"));
                    retVal.setTimeStamp(resource.get("timeStamp"));
                }
            }
            retArr.add(retVal);
        }
    } catch (BooleanQuery.TooManyClauses tmc) {
        String message = "BooleanQuery.TooManyClauses Exception. "
                + "This typically happens if a PrefixQuery, FuzzyQuery, WildcardQuery, or RangeQuery is expanded to many terms during search. "
                + "Possible solution: BooleanQuery.setMaxClauseCount() > 1024(default)"
                + " or reform the search querry with more letters and less wildcards (*).";
        log.error(message, tmc);
        // Exception gets thrown again to allow the caller to react
        throw tmc;
    } catch (Exception e) {
        log.error("Error performing search with compass: " + e.getMessage(), e);
    }
    if (log.isDebugEnabled()) log.debug("finished compass-search");
    return retArr.toArray(new SearchResultValue[retArr.size()]);
}
/**
 * This method provides search suggestions for the <b>searchItem</b> input string.
 * Candidates come from the Lucene spell checker (up to 20, similarity 0.5); a candidate is kept
 * only if a search for it - restricted to the unit or site and to the authorization tokens in
 * <b>safeGuardCookieMap</b> - returns at least one hit.
 *
 * @param siteId site the suggestions are limited to (used when {@code unitId} is {@code null})
 * @param unitId unit the suggestions are limited to, or {@code null}
 * @param searchItem the string to find suggestions for
 * @param safeGuardCookieMap map whose keys are added as allowed realms, may be {@code null}
 * @return a {@link String} matrix with exactly {@code NUMBER_OF_SUGGESTIONS} rows and 2 columns.
 *         Filled rows come first: column 0 holds the suggestion, column 1 the number of hits
 *         returned for it. Rows without a suggestion contain {@code null} in both columns.
 * @throws Exception if the spell checker lookup or one of the searches fails
 */
@Transactional(readOnly = true)
public String[][] searchWebSuggestions(Integer siteId,Integer unitId, final String searchItem, Map safeGuardCookieMap) throws Exception {
    String[] results = luceneService.getSpellChecker().suggestSimilar(searchItem, 20, 0.5f);
    // Arrays.toString prints the candidates; results.toString() would only print the array identity.
    if (log.isInfoEnabled()) log.info(java.util.Arrays.toString(results));
    String[][] resultValue = new String[NUMBER_OF_SUGGESTIONS][2];
    int filled = 0;
    for (int i = 0; i < results.length && filled < NUMBER_OF_SUGGESTIONS; i++) {
        Query query = buildRatedWildcardQuery(siteId, unitId, results[i], null, safeGuardCookieMap, false);
        TopScoreDocCollector collector = luceneService.search(query);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        if (hits.length > 0) {
            resultValue[filled][0] = results[i];
            resultValue[filled][1] = String.valueOf(hits.length);
            filled++;
        }
    }
    return resultValue;
}
/**
 * Removes every character that is not allowed in an XML 1.0 document.
 * <p>
 * Kept are tab, line feed, carriage return, U+0020 to U+D7FF, U+E000 to U+FFFD and the
 * supplementary range U+10000 to U+10FFFF. Everything else - other control characters,
 * unpaired surrogates, U+FFFE and U+FFFF - is dropped. The text is processed by code point so
 * that valid surrogate pairs stay intact.
 *
 * @param in the text to clean, may be {@code null}
 * @return the cleaned text; the empty string if {@code in} is {@code null}
 */
public String stripNonValidXMLCharacters(String in) {
    if (in == null) return "";
    StringBuilder cleaned = new StringBuilder(in.length());
    int index = 0;
    while (index < in.length()) {
        int codePoint = in.codePointAt(index);
        index += Character.charCount(codePoint);
        boolean valid = codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD
                || (codePoint >= 0x20 && codePoint <= 0xD7FF)
                || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
                || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
        if (valid) {
            cleaned.appendCodePoint(codePoint);
        }
    }
    return cleaned.toString();
}
/**
 * Builds the Lucene query used by the web search and the suggestion search.
 * <p>
 * The resulting boolean query requires, in this order:
 * <ul>
 * <li>a wildcard match of {@code *searchUrl*} on the "url" field, if {@code searchUrl} is given,</li>
 * <li>the "unitId" field to match {@code unitId}, or - if that is {@code null} - the "siteId"
 *     field to match {@code siteId},</li>
 * <li>"isLiveContent" to be "true", if {@code isLiveServer} is set,</li>
 * <li>a match of {@code searchItem} in at least one of the fields "metadata", "url", "title",
 *     "contents", boosted 4, 3, 2, 1 in that order,</li>
 * <li>if {@code safeGuard} has keys: the "realm" field to match one of them or the null marker
 *     {@code Constants.SEARCH_INDEX_NULL} (pages without protection).</li>
 * </ul>
 * The index reader needed to rewrite the wildcard queries is opened for this call only and
 * closed before the method returns.
 *
 * @param siteId site restriction, used when {@code unitId} is {@code null}
 * @param unitId unit restriction, may be {@code null}
 * @param searchItem already escaped search string
 * @param searchUrl already escaped URL restriction, may be {@code null}
 * @param safeGuard map whose keys are realm names, may be {@code null}
 * @param isLiveServer whether to restrict the search to live content
 * @return the combined query
 * @throws IOException if the index cannot be opened, read or closed
 * @throws ParseException if one of the query strings cannot be parsed
 */
@SuppressWarnings("unchecked")
private Query buildRatedWildcardQuery(Integer siteId,Integer unitId, String searchItem, String searchUrl, Map safeGuard, boolean isLiveServer) throws IOException, ParseException, InstantiationException, IllegalAccessException, ClassNotFoundException {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    if (log.isInfoEnabled()) log.info("Created search analyzer - class is: " + analyzer.getClass().getName());
    BooleanQuery booleanQuery = new BooleanQuery();
    Query query;
    QueryParser parser;
    IndexReader ir = IndexReader.open(luceneService.getDirectory());
    try {
        if (searchUrl != null) {
            if (log.isDebugEnabled()) log.debug("to search all hosts and subpages attaching * at start and end of urlString...");
            // search on this side and all sub sites on all hosts
            searchUrl = "*" + searchUrl + "*";
            if (log.isDebugEnabled()) log.debug("urlString before parsing: " + searchUrl);
            parser = new QueryParser(Version.LUCENE_36, "url", analyzer);
            parser.setAllowLeadingWildcard(true);
            query = parser.parse(searchUrl).rewrite(ir);
            booleanQuery.add(query, BooleanClause.Occur.MUST);
        }
        if (unitId != null) {
            query = new QueryParser(Version.LUCENE_36, "unitId", analyzer).parse(unitId.toString());
        } else {
            query = new QueryParser(Version.LUCENE_36, "siteId", analyzer).parse(siteId.toString());
        }
        booleanQuery.add(query, BooleanClause.Occur.MUST);
        if (isLiveServer) {
            query = new QueryParser(Version.LUCENE_36, "isLiveContent", analyzer).parse("true");
            booleanQuery.add(query, BooleanClause.Occur.MUST);
        }
        BooleanQuery fieldQuery = new BooleanQuery();
        String[] searchFields = {"metadata", "url", "title", "contents"};
        for (int i = 0; i < searchFields.length; i++) {
            // For the last field ("contents") the URL pattern is appended to the search string.
            if (i == (searchFields.length - 1) && searchUrl != null) {
                searchItem = searchItem + " " + searchUrl;
            }
            if (log.isDebugEnabled()) {
                log.debug("wildcart query string: " + searchItem);
            }
            parser = new QueryParser(Version.LUCENE_36, searchFields[i], analyzer);
            parser.setAllowLeadingWildcard(true);
            query = parser.parse(searchItem);
            if (log.isDebugEnabled()) log.debug("wildcart query part: " + query.toString());
            query = query.rewrite(ir);
            // Earlier fields weigh more: boost counts down from searchFields.length to 1.
            query.setBoost(searchFields.length - i);
            if (log.isDebugEnabled()) log.debug("wildcart query part - rewritten: " + query.toString());
            fieldQuery.add(query, BooleanClause.Occur.SHOULD);
        }
        booleanQuery.add(fieldQuery, BooleanClause.Occur.MUST);
    } finally {
        // The reader is only needed for rewrite(); without this close every search leaked one.
        ir.close();
    }
    if (safeGuard != null && !safeGuard.keySet().isEmpty()) {
        if (log.isDebugEnabled()) log.debug("Safeguard found - adding realms to query.");
        BooleanQuery realmQuery = new BooleanQuery();
        Iterator it = safeGuard.keySet().iterator();
        while (it.hasNext()) {
            // NOTE(review): encodes as ISO-8859-1 and decodes with the platform charset - confirm this is intended.
            String key = new String(((String) it.next()).getBytes("ISO-8859-1"));
            if (key.trim().equalsIgnoreCase("")) continue;
            if (log.isDebugEnabled()) log.debug("adding realm: " + key);
            parser = new QueryParser(Version.LUCENE_36, "realm", analyzer);
            query = parser.parse(key);
            realmQuery.add(query, BooleanClause.Occur.SHOULD);
        }
        // adding null representation to find pages without security
        parser = new QueryParser(Version.LUCENE_36, "realm", analyzer);
        query = parser.parse(Constants.SEARCH_INDEX_NULL);
        realmQuery.add(query, BooleanClause.Occur.SHOULD);
        booleanQuery.add(realmQuery, BooleanClause.Occur.MUST);
    }
    return booleanQuery;
}
/**
 * Updates the Lucene and XML index entries of every page that shows the given content.
 * <p>
 * Depending on the indexing mode of the Lucene service the last (work) content version, the
 * published (live) content version or both are used. Nothing is indexed when neither version
 * exists (the content's update flag is cleared) or when one of them is locked (the flag is
 * left untouched so the content is picked up again later).
 * <p>
 * Indexed are all view components referencing the content and, for each of them, the symlink
 * view components referencing that view component. Finally the content's update flag is cleared.
 *
 * @param contentId id of the content to index
 */
@Transactional(propagation = Propagation.REQUIRES_NEW)
public void indexPage(Integer contentId) {
    ContentHbm content = getContentHbmDao().load(contentId);
    IndexingMode mode = luceneService.getIndexingMode();
    ContentVersionHbm contentVersion = null;
    ContentVersionHbm contentLiveVersion = null;
    if (mode == IndexingMode.WORK || mode == IndexingMode.BOTH) {
        contentVersion = content.getLastContentVersion();
    }
    if (mode == IndexingMode.LIVE || mode == IndexingMode.BOTH) {
        contentLiveVersion = content.getContentVersionForPublish();
    }
    if (contentVersion == null && contentLiveVersion == null) {
        log.error("ContentVersion not existing for content: " + content.getContentId());
        content.setUpdateSearchIndex(false);
        return;
    }
    if ((contentVersion != null && contentVersion.getLock() != null) || (contentLiveVersion != null && contentLiveVersion.getLock() != null)) {
        log.error("Skipping index to avoid deadlock. ContentVersion is locked: " + content.getContentId());
        return;
    }
    String contentText = (contentVersion != null) ? contentVersion.getText() : null;
    String contentLiveText = (contentLiveVersion != null) ? contentLiveVersion.getText() : null;
    Collection vclColl = getViewComponentHbmDao().findByReferencedContent(content.getContentId().toString());
    for (Object vcObject : vclColl) {
        ViewComponentHbm viewComponent = (ViewComponentHbm) vcObject;
        log.info("indexing content for referenced vc id "+viewComponent.getViewComponentId()+"("+viewComponent.getDisplayLinkName()+")");
        if (log.isDebugEnabled()) log.debug("Updating Indexes for VCID " + viewComponent.getDisplayLinkName());
        updateIndexesForViewComponent(viewComponent, contentText, contentLiveText);
        // References in the tree can only be found through findByReferencedViewComponent,
        // not through the XML search engine.
        Collection referencingViewComponents = getViewComponentHbmDao().findByReferencedViewComponent(viewComponent.getViewComponentId().toString());
        for (Object refObject : referencingViewComponents) {
            ViewComponentHbm refViewComponent = (ViewComponentHbm) refObject;
            log.info("indexing content for referencing vc id "+refViewComponent.getViewComponentId()+"("+refViewComponent.getDisplayLinkName()+")");
            if (refViewComponent.getViewType() == Constants.VIEW_TYPE_SYMLINK) {
                // acts as normal content
                if (log.isDebugEnabled()) log.debug("trying to index symLink " + refViewComponent.getDisplayLinkName());
                updateIndexesForViewComponent(refViewComponent, contentText, contentLiveText);
            }
        }
    }
    content.setUpdateSearchIndex(false);
}

/**
 * Brings the Lucene and XML index entries of one view component in line with its index flags:
 * indexes the work text for the work server and the live text for the live server when the
 * view component is search-indexed, removes the entries otherwise; same for the XML index,
 * which receives the work text if present and the live text as fallback.
 */
private void updateIndexesForViewComponent(ViewComponentHbm viewComponent, String contentText, String contentLiveText) {
    boolean hasLivePreview = (getLiveUrl(viewComponent) != null);
    boolean hasWorkPreview = (viewComponent.getViewDocument().getSite().getPreviewUrlWorkServer() != null);
    if (viewComponent.isSearchIndexed()) {
        if (hasWorkPreview && contentText != null) this.indexPage4Lucene(viewComponent, contentText, false);
        if (hasLivePreview && contentLiveText != null) this.indexPage4Lucene(viewComponent, contentLiveText, true);
    } else {
        if (hasLivePreview) searchengineDeleteService.deletePage4Lucene(viewComponent, true);
        if (hasWorkPreview) searchengineDeleteService.deletePage4Lucene(viewComponent, false);
    }
    if (viewComponent.isXmlSearchIndexed()) {
        this.indexPage4Xml(viewComponent, (contentText != null) ? contentText : contentLiveText);
    } else {
        searchengineDeleteService.deletePage4Xml(viewComponent);
    }
}
/**
 * Downloads the rendered page of a view component and adds it to the Lucene index.
 * <p>
 * The page URL is the live URL for {@code isLive}, otherwise the URL supplied by the delete
 * service. The URL handed to the document locator is the page URL shortened by the length of
 * the site's page name. The resulting document gets an additional "isLiveContent" field.
 * All errors are logged as warnings; the temporary download file is always deleted.
 *
 * @param viewComponent the page to index
 * @param contentText not read by this method; the indexed text comes from the downloaded page
 * @param isLive whether the live or the work variant of the page is indexed
 */
private void indexPage4Lucene(ViewComponentHbm viewComponent, String contentText, boolean isLive) {
    if (log.isDebugEnabled()) log.debug("Lucene-Index create / update for VC " + viewComponent.getViewComponentId());
    ViewDocumentHbm vdl = viewComponent.getViewDocument();
    File downloaded = null;
    try {
        String currentUrl = isLive ? getLiveUrl(viewComponent) : searchengineDeleteService.getUrl(viewComponent, isLive);
        downloaded = this.downloadFile(currentUrl);
        if (downloaded == null) {
            log.warn("Critical Error during indexPage4Lucene - cound not find Ressource: " + currentUrl);
        } else {
            // cut of host name too - since it could be an other one
            String pageName = viewComponent.getViewDocument().getSite().getPageNameSearch();
            int keep = currentUrl.length() - pageName.length();
            String cleanUrl = currentUrl.substring(0, keep);
            Date indexedAt = new Date(System.currentTimeMillis());
            Document resource = htmlDocumentLocator.getResource(downloaded, cleanUrl, indexedAt, viewComponent, vdl);
            Field liveFlag = new Field("isLiveContent", Boolean.toString(isLive), Field.Store.YES, Field.Index.ANALYZED);
            resource.add(liveFlag);
            luceneService.addToIndex(resource);
        }
    } catch (Exception e) {
        log.warn("Error indexPage4Lucene, VCID " + viewComponent.getViewComponentId().toString() + ": " + e.getMessage(), e);
    } finally {
        // delete temp file
        if (downloaded != null) {
            downloaded.delete();
        }
    }
    if (log.isDebugEnabled()) log.debug("finished indexPage4Lucene");
}
/**
 * Stores the content XML of a view component in the XML database, together with the meta
 * attributes "infoText" (link description), "text" (display link name) and "unitId".
 * <p>
 * The text is parsed first; if it is not parseable XML nothing is stored and the reason is
 * logged as a warning.
 *
 * @param viewComponent the page the content belongs to
 * @param contentText the content XML to store
 */
private void indexPage4Xml(ViewComponentHbm viewComponent, String contentText) {
    if (log.isDebugEnabled()) log.debug("XML-Index create / update for VC " + viewComponent.getViewComponentId());
    ViewDocumentHbm vdl = viewComponent.getViewDocument();
    SiteHbm site = vdl.getSite();
    org.w3c.dom.Document wdoc = null;
    try {
        // Parsing is only a validity check; the original string is what gets stored.
        wdoc = XercesHelper.string2Dom(contentText);
    } catch (Throwable t) {
        // Stay best-effort as before, but leave a trace instead of failing silently.
        log.warn("Content of VC " + viewComponent.getViewComponentId() + " could not be parsed as XML, skipping XML index: " + t.getMessage(), t);
    }
    if (wdoc != null) {
        HashMap<String, String> metaAttributes = new HashMap<String, String>();
        if (log.isDebugEnabled()) {
            log.debug("SearchUpdateMessageBeanImpl.indexVC(...) -> infoText = " + viewComponent.getLinkDescription());
            log.debug("SearchUpdateMessageBeanImpl.indexVC(...) -> text = " + viewComponent.getDisplayLinkName());
            log.debug("SearchUpdateMessageBeanImpl.indexVC(...) -> unitId = " + viewComponent.getUnit4ViewComponent());
        }
        metaAttributes.put("infoText", viewComponent.getLinkDescription());
        metaAttributes.put("text", viewComponent.getDisplayLinkName());
        metaAttributes.put("unitId", (viewComponent.getUnit4ViewComponent() == null ? "" : viewComponent.getUnit4ViewComponent().toString()));
        xmlDb.saveXml(site.getSiteId(), viewComponent.getViewComponentId(), contentText, metaAttributes);
    }
    if (log.isDebugEnabled()) log.debug("finished indexPage4Xml");
}
/**
 * Downloads a URL into a temporary file, trying up to three times.
 * <p>
 * On success the caller owns the returned file and is expected to delete it. On failure the
 * temporary file is deleted here, the reason is logged and {@code null} is returned.
 *
 * @param strUrl the URL to fetch
 * @return the temporary file holding the response body, or {@code null} if the download failed
 */
private File downloadFile(String strUrl) {
    File file = null;
    try {
        file = File.createTempFile("indexingTempFile", "html");
        // Safety net only; the file is normally deleted explicitly.
        file.deleteOnExit();
        client.getParams().setConnectionManagerTimeout(20000);
        for (int attempt = 0; attempt < 3; attempt++) {
            if (log.isDebugEnabled()) log.debug("Trying " + attempt + " to catch the URL");
            // A fresh request object per attempt, so nothing from a failed attempt carries over.
            HttpMethod method = new GetMethod(strUrl);
            method.setFollowRedirects(true);
            method.getParams().setBooleanParameter(HttpMethodParams.USE_EXPECT_CONTINUE, true);
            if (this.downloadToFile(method, file, strUrl)) {
                if (log.isDebugEnabled()) {
                    log.debug("got it!");
                    log.debug("downloadFile: FINISHED");
                }
                return file;
            }
        }
        log.error("Trying to fetch the URL " + strUrl + " three times with no success - page could not be loaded in searchengine!");
    } catch (Exception exe) {
        log.error("Error downloading '" + strUrl + "': " + exe.getMessage(), exe);
    }
    // Failure path: do not leave the temp file behind until JVM exit.
    if (file != null && !file.delete()) {
        log.warn("Could not delete temporary file " + file.getAbsolutePath());
    }
    return null;
}
/**
 * Executes the given HTTP method and writes the response body to the file.
 * <p>
 * The body is read as a string and written with the platform default charset. The connection
 * is released in every case, also when an unexpected exception occurs.
 *
 * @param method the prepared HTTP method to execute
 * @param file the file that receives the response body
 * @param strUrl the URL being fetched; used for log messages only
 * @return {@code true} if a response body was received and written, {@code false} otherwise
 */
private boolean downloadToFile(HttpMethod method, File file, String strUrl) {
    try {
        int status = client.executeMethod(method);
        if (log.isDebugEnabled()) log.debug("HTTP status " + status + " for '" + strUrl + "'");
        String responseBody = method.getResponseBodyAsString();
        if (responseBody == null) {
            // No body available; report a failed attempt instead of running into a NullPointerException.
            log.error("No response body received from '" + strUrl + "'");
            return false;
        }
        OutputStream out = new FileOutputStream(file);
        try {
            out.write(responseBody.getBytes());
        } finally {
            try {
                out.close();
            } catch (IOException ignored) {
                // The data has already been written; a failing close is not treated as an error.
                if (log.isDebugEnabled()) log.debug("Could not close " + file + ": " + ignored.getMessage());
            }
        }
        return true;
    } catch (HttpException he) {
        log.error("Http error connecting to '" + strUrl + "'");
        log.error(he.getMessage());
    } catch (IOException ioe) {
        log.error("Unable to connect to '" + strUrl + "'", ioe);
    } finally {
        // clean up the connection resources
        method.releaseConnection();
    }
    return false;
}
/**
 * Adds a stored document to the Lucene index.
 * <p>
 * Documents whose mime type is not supported by the resource locator factory are skipped.
 * In every case - skipped, indexed or failed - the document's update flag is cleared, so a
 * failing document is not retried. Failures are logged with their stack trace.
 *
 * @param documentId id of the document to index
 */
public void indexDocument(Integer documentId) {
    if (log.isDebugEnabled()) {
        log.debug("Index create / update for Document: " + documentId);
    }
    DocumentHbm document = getDocumentHbmDao().load(documentId);
    if (log.isDebugEnabled()) {
        log.debug("Document " + document.getDocumentId() + " \"" + document.getDocumentName() + "\" \"" + document.getMimeType());
    }
    if (!documentResourceLocatorFactory.isSupportedFileFormat(document.getMimeType())) {
        if (log.isInfoEnabled()) log.info("Document " + document.getDocumentId() + " \"" + document.getDocumentName() + "\" \"" + document.getMimeType() + "\" is not supported, skipping...");
        document.setUpdateSearchIndex(false);
        return;
    }
    try {
        Document resource = documentResourceLocatorFactory.getResource(document);
        luceneService.addToIndex(resource);
    } catch (Exception e) {
        // Keep the stack trace; the message alone rarely tells which extractor failed.
        log.warn("Error indexDocument " + document.getDocumentId().toString() + ": " + e.getMessage(), e);
    } finally {
        document.setUpdateSearchIndex(false);
    }
    if (log.isInfoEnabled()) log.info("finished indexDocument " + document.getDocumentId() + " \"" + document.getDocumentName() + "\"");
}
/**
 * Builds the live-server URL of a page.
 * <p>
 * The first host of the page's site that is marked as live server supplies the host name; the
 * URL has the form {@code http://<host>/<language>/<path>.<pageNameSearch>}.
 *
 * @param viewComponent the page to build the URL for
 * @return the URL, or {@code null} if the site has no live-server host
 */
public String getLiveUrl(ViewComponentHbm viewComponent) {
    SiteHbm siteHbm = viewComponent.getViewDocument().getSite();
    Collection<HostHbm> siteHosts = getHostHbmDao().findAll(siteHbm.getSiteId());
    String base = null;
    for (HostHbm candidate : siteHosts) {
        if (candidate.isLiveserver()) {
            base = "http://" + candidate.getHostName() + "/";
            break;
        }
    }
    if (base == null) {
        return null;
    }
    String liveUrl = base + viewComponent.getViewDocument().getLanguage() + "/" + viewComponent.getPath() + "." + siteHbm.getPageNameSearch();
    if (log.isInfoEnabled()) log.info("created url " + liveUrl + " for site " + siteHbm.getName());
    return liveUrl;
}
}