/** * Copyright (c) 2009 Juwi MacMillan Group GmbH * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.juwimm.cms.search.res; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.StringWriter; import java.util.Collection; import java.util.Date; import java.util.Iterator; import java.util.Properties; import java.util.StringTokenizer; import org.apache.log4j.Logger; import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.springframework.beans.factory.annotation.Autowired; import org.tizzit.util.XercesHelper; import de.juwimm.cms.common.Constants; import de.juwimm.cms.model.ContentHbm; import de.juwimm.cms.model.ContentHbmDao; import de.juwimm.cms.model.UnitHbm; import de.juwimm.cms.model.UnitHbmDao; import de.juwimm.cms.model.ViewComponentHbm; import de.juwimm.cms.model.ViewDocumentHbm; import de.juwimm.cms.safeguard.model.Realm2viewComponentHbm; import de.juwimm.cms.safeguard.model.Realm2viewComponentHbmDao; import de.juwimm.cms.safeguard.model.RealmJaasHbm; import de.juwimm.cms.safeguard.model.RealmJdbcHbm; import de.juwimm.cms.safeguard.model.RealmLdapHbm; import de.juwimm.cms.safeguard.model.RealmSimplePwHbm; import de.juwimm.cms.search.res.html.HTMLParser; /** * Helper for getting injected the Compass-Instances from Spring and offering getters for them * @author <a href="mailto:carsten.schalm@juwimm.com">Carsten Schalm</a> * company Juwi|MacMillan Group Gmbh, Walsrode, Germany * @version $Id$ */ public class HtmlDocumentLocator { private static Logger log = Logger.getLogger(HtmlDocumentLocator.class); @Autowired private UnitHbmDao unitHbmDao; @Autowired private ContentHbmDao contentHbmDao; @Autowired private Realm2viewComponentHbmDao realm2VcHbmDao; public Document getResource(File file, String url, Date modifiedDate, ViewComponentHbm vcl, ViewDocumentHbm vdl) throws IOException, InterruptedException { String cleanUrl = vdl.getLanguage() + "/" + vcl.getPath(); Document resource = new Document();// resourceFactory.createResource("HtmlSearchValue"); resource.add(new Field("alias", "HtmlSearchValue", Field.Store.YES, Field.Index.ANALYZED_NO_NORMS)); resource.add(new Field("url", cleanUrl, Field.Store.YES, Field.Index.ANALYZED)); resource.add(new Field("siteId", vdl.getSite().getSiteId().toString(), Field.Store.YES, Field.Index.ANALYZED)); resource.add(new Field("language", vdl.getLanguage(), Field.Store.YES, Field.Index.ANALYZED)); resource.add(new Field("viewtype", vdl.getViewType(), Field.Store.YES, Field.Index.ANALYZED)); resource.add(new Field("modified", DateTools.timeToString(modifiedDate.getTime(), DateTools.Resolution.MINUTE), Field.Store.YES, Field.Index.ANALYZED)); resource.add(new Field("uid", url, Field.Store.YES, Field.Index.ANALYZED_NO_NORMS)); resource.add(new Field("viewComponentId", vcl.getViewComponentId().toString(), Field.Store.YES, Field.Index.ANALYZED)); try { Integer unitId = vcl.getViewComponentUnit().getUnit4ViewComponent(); if (unitId != null) { UnitHbm unit = unitHbmDao.load(unitId); resource.add(new Field("unitId", unit.getUnitId().toString(), Field.Store.YES, Field.Index.ANALYZED)); resource.add(new Field("unitName", unit.getName(), Field.Store.YES, Field.Index.ANALYZED)); } } catch (Exception exe) { if (log.isDebugEnabled()) log.debug("unitId could not be loaded for vcId: " + vcl.getViewComponentId(), exe); } try { ContentHbm content = contentHbmDao.load(new Integer(vcl.getReference())); resource.add(new Field("template", content.getTemplate(), Field.Store.YES, Field.Index.ANALYZED)); } catch (Exception exe) { if (log.isDebugEnabled()) log.debug("template could not be loaded for vcId: " + vcl.getViewComponentId(), exe); } if (log.isDebugEnabled()) log.debug("looking for realm now: "); try { String realm = getRealm4Vc(vcl); if (realm != null) { if (log.isDebugEnabled()) log.debug("realm is: " + realm); resource.add(new Field("realm", realm, Field.Store.YES, Field.Index.ANALYZED)); } else { if (log.isDebugEnabled()) log.debug("page not protected - adding 'nullValue' for the search query."); resource.add(new Field("realm", Constants.SEARCH_INDEX_NULL, Field.Store.YES, Field.Index.ANALYZED)); } } catch (Exception exe) { if (log.isDebugEnabled()) log.debug("realms could not be loaded for vcId: " + vcl.getViewComponentId(), exe); } InputStream inputStream=new FileInputStream(file); HTMLParser parser = new HTMLParser(inputStream); return parseHtml(resource, parser); } private String getRealm4Vc(ViewComponentHbm vc) { if (vc == null) return null; Realm2viewComponentHbm realm2vcHmb = realm2VcHbmDao.findByViewComponent(vc.getViewComponentId()); if (realm2vcHmb == null) { return getRealm4Vc(vc.getParent()); } RealmJdbcHbm jdbc = realm2vcHmb.getJdbcRealm(); if (jdbc != null) return createRealmRoleCombo("JDBC_" + jdbc.getJdbcRealmId(), realm2vcHmb.getRoleNeeded()); RealmSimplePwHbm simple = realm2vcHmb.getSimplePwRealm(); if (simple != null) return createRealmRoleCombo("SIMPLEPW_" + simple.getSimplePwRealmId(), realm2vcHmb.getRoleNeeded()); RealmLdapHbm ldap = realm2vcHmb.getLdapRealm(); if (ldap != null) return createRealmRoleCombo("LDAP_" + ldap.getLdapRealmId(), realm2vcHmb.getRoleNeeded()); RealmJaasHbm jaas = realm2vcHmb.getJaasRealm(); if (jaas != null) return createRealmRoleCombo("JAAS_" + jaas.getJaasRealmId(), realm2vcHmb.getRoleNeeded()); return null; } private String createRealmRoleCombo(String realm, String rolesNeeded) { //if (log.isDebugEnabled()) log.info("createRealmRoleCombo realm: " + realm + " roles: " + rolesNeeded); if (rolesNeeded == null) return realm; String result = ""; String[] roles = null; roles = rolesNeeded.split(Constants.SAFEGUARD_ROLE_SEPARATOR); for (int i = 0; i < roles.length; i++) { log.info("found: >" + roles[i] + "<"); if (!roles[i].trim().isEmpty()) { result = result + " " + realm + "_" + roles[i].trim(); log.info("result is now: >" + result + "<"); } } return result; } public Document getExternalResource(String url, Reader htmlContent) throws IOException, InterruptedException { Document resource = new Document(); resource.add(new Field("alias", "HtmlSearchValue", Field.Store.YES, Field.Index.ANALYZED_NO_NORMS)); resource.add(new Field("url", url, Field.Store.YES, Field.Index.ANALYZED)); resource.add(new Field("uid", url, Field.Store.YES, Field.Index.ANALYZED)); HTMLParser parser = new HTMLParser(htmlContent); return parseHtml(resource, parser); } public String stripNonValidXMLCharacters(String in) { if (in == null) return ""; String stripped = in.replaceAll("[^\\u0009\\u000a\\u000d\\u0020-\\ud7ff\\e0000-\\ufffd]", "").replaceAll("[&<>]", ""); return stripped; } private Document parseHtml(Document resource, HTMLParser parser) throws IOException, InterruptedException { Reader reader = parser.getReader(); StringWriter sw = new StringWriter(); org.apache.commons.io.IOUtils.copy(reader, sw); String sresult = sw.toString(); if (log.isDebugEnabled()) log.debug("Saving tokenized HTML value into searchengine: " + sresult); resource.add(new Field("contents", stripNonValidXMLCharacters(sresult), Field.Store.YES, Field.Index.ANALYZED)); Properties prop = parser.getMetaTags(); Collection metafields = prop.values(); String metadata = ""; Iterator it = metafields.iterator(); while (it.hasNext()) { StringTokenizer st = new StringTokenizer((String) it.next(), ","); while (st.hasMoreElements()) { String token = st.nextToken().trim(); metadata += token + " "; } } // tidy the metadata metadata = XercesHelper.html2utf8string(metadata); resource.add(new Field("metadata", metadata, Field.Store.YES, Field.Index.ANALYZED)); // Add the summary as a field that is stored and returned with // hit documents for display. resource.add(new Field("summary", parser.getSummary(), Field.Store.YES, Field.Index.ANALYZED_NO_NORMS)); // Add the title as a field that it can be searched and that is stored. resource.add(new Field("title", parser.getTitle(), Field.Store.YES, Field.Index.ANALYZED)); reader.close(); return resource; } }