/* Copyright (2007-2012) Schibsted ASA * This file is part of Possom. * * Possom is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Possom is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with Possom. If not, see <http://www.gnu.org/licenses/>. */ package no.sesat.search.mode.command; import no.sesat.search.datamodel.DataModel; import no.sesat.search.datamodel.generic.StringDataObject; import no.sesat.search.http.HTTPClient; import no.sesat.search.mode.config.NewsAggregatorCommandConfig; import no.sesat.search.result.BasicResultList; import no.sesat.search.result.FastSearchResult; import no.sesat.search.result.Modifier; import no.sesat.search.result.ResultItem; import no.sesat.search.result.ResultList; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import com.opensymphony.oscache.base.NeedsRefreshException; import com.opensymphony.oscache.general.GeneralCacheAdministrator; import java.io.IOException; import java.net.URL; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; /** * Search command that will try to get pregenerated clusters from xml files. If the xml file is not available it will * fall back to a search. * * * @version $Id$ */ public final class NewsAggregatorSearchCommand extends NewsClusteringESPFastCommand { private static final Logger LOG = Logger.getLogger(NewsAggregatorSearchCommand.class); private static final String PARAM_CLUSTER_ID = "clusterId"; /** * @param cxt The context to execute in. */ public NewsAggregatorSearchCommand(final Context cxt) { super(cxt); } @Override public ResultList<ResultItem> execute() { final NewsAggregatorCommandConfig config = getSearchConfiguration(); final StringDataObject clusterId = datamodel.getParameters().getValue(PARAM_CLUSTER_ID); final String xmlUrl = getXmlUrlString(datamodel, config); LOG.debug("Loading xml file at: " + xmlUrl); ResultList<ResultItem> searchResult; if (clusterId == null) { searchResult = getPageResult(config, xmlUrl); } else { searchResult = getClusterResult(config, clusterId, xmlUrl); } LOG.debug("Done (+Tried loading xml file at: " + xmlUrl + ")"); return searchResult; } private String getXmlUrlString(final DataModel dataModel, final NewsAggregatorCommandConfig config) { String geographic = "main"; String category = "main"; String[] geographicFields = config.getGeographicFieldArray(); for (String geographicField : geographicFields) { StringDataObject geo = dataModel.getParameters().getValue(geographicField); if (geo != null) { geographic = formatToConvention(geo.getString()); break; } } for (String categoryField : config.getCategoryFieldArray()) { StringDataObject cat = dataModel.getParameters().getValue(categoryField); if (cat != null) { category = formatToConvention(cat.getString()); break; } } StringBuilder sb = new StringBuilder(config.getXmlSource()); sb.append("fp_"); sb.append(category).append('_').append(geographic).append(".xml"); return sb.toString(); } private String formatToConvention(String replaceString) { String newString = StringUtils.replaceChars(replaceString.toLowerCase(), "\u00E6", "ae"); newString = StringUtils.replaceChars(newString, '\u00F8', 'o'); newString = StringUtils.replaceChars(newString, '\u00E5', 'a'); newString = StringUtils.replaceChars(newString, "\u00E4", "ae"); newString = StringUtils.replaceChars(newString, '\u00F6', 'o'); newString = StringUtils.replaceChars(newString, ' ', '_'); return newString; } private ResultList<ResultItem> getClusterResult( final NewsAggregatorCommandConfig config, final StringDataObject clusterId, final String xmlUrl) { ResultList<ResultItem> searchResult; try { final NewsAggregatorXmlParser newsAggregatorXmlParser = new NewsAggregatorXmlParser(); final StringDataObject sortObject = datamodel.getParameters().getValue(config.getUserSortParameter()); final String sort = sortObject == null ? null : sortObject.getString(); searchResult = newsAggregatorXmlParser.parseCluster(config, xmlUrl, clusterId.getString(), getOffset(), sort); addSortModifiers((FastSearchResult) searchResult, config.getUserSortParameter(), "relevance", "descending", "ascending"); if (searchResult != null && searchResult.getHitCount() > 0) { return searchResult; } } catch (IOException e) { LOG.debug("Falling back to search instead of xml parse", e); } catch (SAXException e) { LOG.debug("Falling back to search instead of xml parse", e); } searchResult = search(config, clusterId.getString()); if (searchResult instanceof FastSearchResult) { addSortModifiers((FastSearchResult) searchResult, config.getUserSortParameter(), "descending", "ascending"); } return searchResult; } private void addSortModifiers( final FastSearchResult searchResult, final String id, final String... modifierNames) { for (String modifierName : modifierNames) { searchResult.addModifier(id, new Modifier(modifierName, -1, null)); } } private ResultList<ResultItem> search( final NewsAggregatorCommandConfig config, final String clusterId) { LOG.debug("------ Running search to get clusters ---------"); LOG.debug("clusterId=" + clusterId); LOG.debug("result-fields=" + config.getResultFieldMap()); LOG.debug("query-server=" + config.getQueryServer()); LOG.debug("-----------------------------------------------"); return super.execute(); } @Override public NewsAggregatorCommandConfig getSearchConfiguration() { return (NewsAggregatorCommandConfig) super.getSearchConfiguration(); } private ResultList<ResultItem> getPageResult( final NewsAggregatorCommandConfig config, final String xmlUrl) { final NewsAggregatorXmlParser newsAggregatorXmlParser = new NewsAggregatorXmlParser(); ResultList<ResultItem> searchResult; try { searchResult = newsAggregatorXmlParser.parseFullPage(config, getOffset(), xmlUrl); addSortModifiers((FastSearchResult) searchResult, config.getUserSortParameter(), "descending", "ascending"); if (searchResult != null && searchResult.getHitCount() > 0) { return searchResult; } } catch (IOException e) { LOG.debug("Falling back to search instead of xml parse", e); } catch (SAXException e) { LOG.debug("Falling back to search instead of xml parse", e); } searchResult = search(config, null); if (searchResult instanceof FastSearchResult) { addSortModifiers((FastSearchResult) searchResult, config.getUserSortParameter(), "descending", "ascending"); } return searchResult; } /** * */ @SuppressWarnings({"unchecked"}) public static class NewsAggregatorXmlParser { private static final String ELEMENT_CLUSTER = "cluster"; private static final String ELEMENT_ENTRY_COLLECTION = "entryCollection"; private static final String ATTRIBUTE_FULL_COUNT = "fullcount"; private static final String ATTRIBUTE_CLUSTERID = "id"; private static final String ELEMENT_RELATED = "related"; private static final String ATTRIBUTE_TYPE = "type"; private static final String ELEMENT_CATEGORY = "category"; private static final String ELEMENT_COLLAPSEID = "collapseid"; private static final String ATTRIBUTE_TIMESTAMP = "timestamp"; private static final String ELEMENT_COUNTS = "counts"; private static final String ATTRIBUTE_ENTRY_COUNT = "entries"; private static final String ATTRIBUTE_CLUSTER_COUNT = "clusters"; private static final String ELEMENT_ENTRY = "entry"; private static final GeneralCacheAdministrator CACHE = new GeneralCacheAdministrator(); private static final int REFRESH_PERIOD = 60; // one minute private static final int CACHE_CAPACITY = 64; private static final Logger LOG = Logger.getLogger(NewsAggregatorXmlParser.class); // Static -------------------------------------------------------- static{ CACHE.setCacheCapacity(CACHE_CAPACITY); } /** * Loads an XML document from a URL i.e. file:// or http:// * * @param xmlUrl the url to the xml * @return the parsed XML as an org.w3c.dom.Document object * @throws org.xml.sax.SAXException if the underlying sax parser throws an exception * @throws java.io.IOException if the file could not be read for some reason */ private Document getDocumentFromUrl(String urlString) throws IOException, SAXException { final URL url = new URL(urlString); final HTTPClient httpClient = HTTPClient.instance(url); return httpClient.getXmlDocument(url.getPath()); } /** * Loads an XML document from a URL i.e. file:// or http:// or cache based on a cache timeout * Gets from URL and stores the result in the cache * if a NeedsRefreshException is thrown by the cache * Adapted from TvWaitSearchCommand * * @param xmlUrl the url to the xml * @return the parsed XML as an org.w3c.dom.Document object * @throws org.xml.sax.SAXException if the underlying sax parser throws an exception * @throws java.io.IOException if the file could not be read for some reason */ private final Document getDocumentFromCache(final String urlString) throws IOException, SAXException { Document doc; try { doc = (Document) CACHE.getFromCache(urlString, REFRESH_PERIOD); LOG.debug("Got doc from cache for " + urlString); } catch (NeedsRefreshException e) { boolean updatedCache = false; try{ doc = getDocumentFromUrl(urlString); CACHE.putInCache(urlString, doc); LOG.debug("Got doc from url and added to cache for " + urlString); updatedCache = true; }finally{ if(!updatedCache){ // prevents a deadlock in CACHE! CACHE.cancelUpdate(urlString); } } } return doc; } /** * Parses a specific identified cluster * * @param config the commandConfig * @param xmlUrl the url to the xml containing the cluster * @param clusterId the id of the cluster to parse * @param offset the offset into the cluster where to start returning results * @param sort the sortdirection for the result * @return the parsed result * @throws org.xml.sax.SAXException if the undelying saxpaser throws an exception * @throws java.io.IOException if the file could not be read for some reason */ public FastSearchResult<ResultItem> parseCluster( final NewsAggregatorCommandConfig config, final String xmlUrl, final String clusterId, final int offset, final String sort) throws IOException, SAXException { LOG.debug("Parsing cluster: " + clusterId); // following will either throw a ClassCastException or NPE final FastSearchResult<ResultItem> searchResult = new FastSearchResult<ResultItem>(); final Document doc = getDocumentFromCache(xmlUrl); final Element root = doc.getDocumentElement(); List<Element> clusters = getDirectChildren(root, ELEMENT_CLUSTER); for (Element cluster : clusters) { if (cluster.getAttribute(ATTRIBUTE_CLUSTERID).equals(clusterId)) { handleFlatCluster(config, cluster, searchResult, offset, sort); handleRelated(config, getFirstChild(cluster, ELEMENT_RELATED), searchResult); break; } } return searchResult; } /** * Parses a full summary xml page. * * @param xmlUrl the urel to the page to parse * @param config the commandConfig * @param offset what cluster to start parsing at * @return the result of the parse * @throws org.xml.sax.SAXException if the undelying saxpaser throws an exception * @throws java.io.IOException if the file could not be read for some reason */ public FastSearchResult<ResultItem> parseFullPage( final NewsAggregatorCommandConfig config, final int offset, final String xmlUrl) throws IOException, SAXException { // following will throw a ClassCastException or NPE final FastSearchResult<ResultItem> searchResult = new FastSearchResult<ResultItem>(); final Document doc = getDocumentFromCache(xmlUrl); final Element root = doc.getDocumentElement(); handleClusters(config, offset, getDirectChildren(root, ELEMENT_CLUSTER), searchResult); handleCounts(config, getFirstChild(root, ELEMENT_COUNTS), offset, searchResult); handleRelated(config, getFirstChild(root, ELEMENT_RELATED), searchResult); return searchResult; } private void handleCounts( final NewsAggregatorCommandConfig config, final Element countsElement, final int offset, final FastSearchResult searchResult) { if (countsElement != null) { final String entries = countsElement.getAttribute(ATTRIBUTE_ENTRY_COUNT); if (entries != null && entries.length() > 0) { searchResult.setHitCount(Integer.parseInt(entries)); } final String clusters = countsElement.getAttribute(ATTRIBUTE_CLUSTER_COUNT); if (clusters != null && clusters.length() > 0) { if (offset + config.getResultsToReturn() < Integer.parseInt(clusters)) { addNextOffsetField(offset + config.getResultsToReturn(), searchResult); } } } } private void handleRelated( final NewsAggregatorCommandConfig config, final Element relatedElement, final FastSearchResult searchResult) { if (relatedElement != null) { final List<Element> categoryElements = getDirectChildren(relatedElement, ELEMENT_CATEGORY); for (Element categoryElement : categoryElements) { final String categoryType = categoryElement.getAttribute(ATTRIBUTE_TYPE); final List<Modifier> relatedList = searchResult.getModifiers(categoryType); int categoryCount = 0; if (relatedList != null) { categoryCount = relatedList.size(); } if (categoryCount < config.getRelatedMaxCount()) { final Modifier modifier = new Modifier(categoryElement.getTextContent().trim(), -1, null); searchResult.addModifier(categoryType, modifier); } } } } private void handleClusters( final NewsAggregatorCommandConfig config, final int offset, final List<Element> clusters, final ResultList<ResultItem> searchResult) { final int maxOffset = offset + config.getResultsToReturn(); for (int i = offset; i < clusters.size() && i < maxOffset; i++) { Element cluster = clusters.get(i); handleCluster(config, cluster, searchResult); } } private void handleFlatCluster( final NewsAggregatorCommandConfig config, final Element cluster, final ResultList<ResultItem> searchResult, int offset, final String sort) { if (cluster != null) { final Element entryCollectionElement = getFirstChild(cluster, ELEMENT_ENTRY_COLLECTION); if (entryCollectionElement != null) { final List<Element> entryList = getDirectChildren(entryCollectionElement, ELEMENT_ENTRY); searchResult.setHitCount(entryList.size()); final Map<String, ResultList<ResultItem>> collapseMap = new HashMap<String, ResultList<ResultItem>>(); final ResultList<ResultItem> tmpSearchResult = new BasicResultList<ResultItem>(); // Collecting all results from xml. (This must be done if we want correct collpsing funtionality for (Element entry : entryList) { final ResultList<ResultItem> searchResultItem = new BasicResultList<ResultItem>(); handleEntry(entry, searchResultItem); addResult(config, searchResultItem, tmpSearchResult, collapseMap, true); } sortResults(tmpSearchResult, sort); int lastIndex = Math.min(tmpSearchResult.getResults().size(), offset + config.getResultsToReturn()); for (int i = offset; i < lastIndex; i++) { searchResult.addResult(tmpSearchResult.getResults().get(i)); } if ((offset + config.getResultsToReturn()) < tmpSearchResult.getResults().size()) { addNextOffsetField(offset + config.getResultsToReturn(), searchResult); } } } } private void sortResults(final ResultList<ResultItem> searchResult, final String sort) { if ("ascending".equals(sort)) { searchResult.sortResults(DateFieldSearchResultComparator.getInstance()); } else if ("descending".equals(sort)) { searchResult.sortResults(Collections.reverseOrder(DateFieldSearchResultComparator.getInstance())); } } private int handleCluster( final NewsAggregatorCommandConfig config, final Element cluster, final ResultList<ResultItem> searchResult) { if (cluster != null) { ResultList<ResultItem> clusterResult = new BasicResultList<ResultItem>(); clusterResult = clusterResult.addField( "size", Integer.toString(Integer.parseInt(cluster.getAttribute(ATTRIBUTE_FULL_COUNT)) - 1)); clusterResult = clusterResult.addField( PARAM_CLUSTER_ID, cluster.getAttribute(ATTRIBUTE_CLUSTERID)); final Element entryCollectionElement = getFirstChild(cluster, ELEMENT_ENTRY_COLLECTION); if (entryCollectionElement != null) { final List<Element> entryList = getDirectChildren(entryCollectionElement, ELEMENT_ENTRY); for (int i = 0; i < entryList.size(); i++) { final Element nestedEntry = entryList.get(i); if (i == 0) { // First element is main result clusterResult = (ResultList<ResultItem>) handleEntry(nestedEntry, clusterResult); } else { ResultList<ResultItem> nestedResultItem = new BasicResultList<ResultItem>(); nestedResultItem = (ResultList<ResultItem>) handleEntry(nestedEntry, nestedResultItem); addResult(config, nestedResultItem, clusterResult); } } searchResult.addResult(clusterResult); clusterResult.setHitCount(entryList.size()); return entryList.size(); } } return 0; } private ResultItem handleEntry(final Element entryElement, ResultItem searchResultItem) { final List<Element> entrySubElements = getDirectChildren(entryElement); for (Element entrySubElement : entrySubElements) { if (entrySubElement.getTextContent() != null && entrySubElement.getTextContent().trim().length() > 0) { searchResultItem = searchResultItem.addField(entrySubElement.getNodeName(), entrySubElement.getTextContent().trim()); } } return searchResultItem; } private void addResult( final NewsAggregatorCommandConfig config, final ResultList<ResultItem> srcResult, final ResultList<ResultItem> targetResult) { addResult(config, srcResult, targetResult, null, false); } private boolean addResult(final NewsAggregatorCommandConfig config, final ResultList<ResultItem> srcResult, final ResultList<ResultItem> targetResult, final Map<String, ResultList<ResultItem>> collapseMap, final boolean noMax) { // Check if entry is duplicate and should be a subresult ResultList<ResultItem> collapseParent = null; String collapseId = srcResult.getField(ELEMENT_COLLAPSEID); if (collapseMap != null) { collapseParent = collapseMap.get(collapseId); } if (collapseParent == null) { // Skipping add if max returned results has been reached. if (noMax || targetResult.getResults().size() < config.getResultsToReturn()) { // No duplicate in results or should not be collapsed targetResult.addResult(srcResult); if (collapseMap != null) { collapseMap.put(collapseId, srcResult); } return true; } return false; } else { // duplicate item, adding as a subresult to first item. collapseParent.addResult(srcResult); return true; } } private static Element getFirstChild(Element element, String elementName) { if (element != null) { NodeList childNodes = element.getChildNodes(); for (int i = 0; i < childNodes.getLength(); i++) { Node childNode = childNodes.item(i); if (childNode instanceof Element && childNode.getNodeName().equals(elementName)) { return (Element) childNode; } } } return null; } private static List<Element> getDirectChildren(Element element, String elementName) { ArrayList<Element> children = new ArrayList<Element>(); if (element != null) { NodeList childNodes = element.getChildNodes(); for (int i = 0; i < childNodes.getLength(); i++) { Node childNode = childNodes.item(i); if (childNode instanceof Element && childNode.getNodeName().equals(elementName)) { children.add((Element) childNode); } } } return children; } private static List<Element> getDirectChildren(Element element) { ArrayList<Element> children = new ArrayList<Element>(); if (element != null) { NodeList childNodes = element.getChildNodes(); for (int i = 0; i < childNodes.getLength(); i++) { Node childNode = childNodes.item(i); if (childNode instanceof Element) { children.add((Element) childNode); } } } return children; } } private static final class DateFieldSearchResultComparator implements Comparator<ResultItem> { private final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); private static final String DATE_FIELD_NAME = NewsAggregatorXmlParser.ATTRIBUTE_TIMESTAMP; private static DateFieldSearchResultComparator myInstance = new DateFieldSearchResultComparator(); public static DateFieldSearchResultComparator getInstance() { return myInstance; } private DateFieldSearchResultComparator() { } public int compare(final ResultItem resultItem1, final ResultItem resultItem2) { final String dateField1 = resultItem1.getField(DATE_FIELD_NAME); final String dateField2 = resultItem2.getField(DATE_FIELD_NAME); if (dateField1 == null || dateField1.length() == 0) { if (dateField2 == null || dateField2.length() == 0) { return 0; } else { return -1; } } else { if (dateField2 == null || dateField2.length() == 0) { return 1; } else { try { final Date date1 = sdf.parse(dateField1); final Date date2 = sdf.parse(dateField2); if (date1.before(date2)) { return -1; } else if (date1.after(date2)) { return 1; } } catch (ParseException e) { LOG.error("Could not parse date field, sort will not work.", e); } return 0; } } } } }