/* Copyright (2007-2012) Schibsted ASA
* This file is part of Possom.
*
* Possom is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Possom is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Possom. If not, see <http://www.gnu.org/licenses/>.
*/
package no.sesat.search.mode.command;
import no.sesat.search.datamodel.DataModel;
import no.sesat.search.datamodel.generic.StringDataObject;
import no.sesat.search.http.HTTPClient;
import no.sesat.search.mode.config.NewsAggregatorCommandConfig;
import no.sesat.search.result.BasicResultList;
import no.sesat.search.result.FastSearchResult;
import no.sesat.search.result.Modifier;
import no.sesat.search.result.ResultItem;
import no.sesat.search.result.ResultList;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import com.opensymphony.oscache.base.NeedsRefreshException;
import com.opensymphony.oscache.general.GeneralCacheAdministrator;
import java.io.IOException;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Search command that will try to get pregenerated clusters from xml files. If the xml file is not available it will
* fall back to a search.
*
*
* @version $Id$
*/
public final class NewsAggregatorSearchCommand extends NewsClusteringESPFastCommand {
// Logger for this command; the nested date comparator also reports parse failures through it.
private static final Logger LOG = Logger.getLogger(NewsAggregatorSearchCommand.class);
// Request parameter selecting a single cluster. The same name is used as the field under which
// each parsed cluster stores its id on the full page.
private static final String PARAM_CLUSTER_ID = "clusterId";
/**
* Creates the command; all initialisation is delegated to the superclass constructor.
*
* @param cxt The context to execute in.
*/
public NewsAggregatorSearchCommand(final Context cxt) {
super(cxt);
}
/**
 * Resolves the xml url for the current request and loads either the full summary page
 * (no {@code clusterId} parameter present) or the single requested cluster.
 *
 * @return the result produced by the page or cluster lookup
 */
@Override
public ResultList<ResultItem> execute() {
    final NewsAggregatorCommandConfig config = getSearchConfiguration();
    final StringDataObject clusterId = datamodel.getParameters().getValue(PARAM_CLUSTER_ID);
    final String xmlUrl = getXmlUrlString(datamodel, config);

    LOG.debug("Loading xml file at: " + xmlUrl);

    final ResultList<ResultItem> searchResult = clusterId == null
            ? getPageResult(config, xmlUrl)
            : getClusterResult(config, clusterId, xmlUrl);

    LOG.debug("Done (+Tried loading xml file at: " + xmlUrl + ")");
    return searchResult;
}
/**
 * Builds the url of the xml file to load: {@code <xmlSource>fp_<category>_<geographic>.xml}.
 * The geographic and category parts come from the first configured request parameter that is
 * present, and both default to "main" when none is.
 *
 * @param dataModel the datamodel holding the request parameters
 * @param config the command configuration naming the candidate parameters and the xml source
 * @return the url string of the xml file
 */
private String getXmlUrlString(final DataModel dataModel, final NewsAggregatorCommandConfig config) {
    // First geographic parameter that is present wins.
    final String[] geoFields = config.getGeographicFieldArray();
    StringDataObject geoValue = null;
    for (int i = 0; geoValue == null && i < geoFields.length; i++) {
        geoValue = dataModel.getParameters().getValue(geoFields[i]);
    }
    final String geographic = geoValue == null ? "main" : formatToConvention(geoValue.getString());

    // Same rule for the category.
    final String[] catFields = config.getCategoryFieldArray();
    StringDataObject catValue = null;
    for (int i = 0; catValue == null && i < catFields.length; i++) {
        catValue = dataModel.getParameters().getValue(catFields[i]);
    }
    final String category = catValue == null ? "main" : formatToConvention(catValue.getString());

    return new StringBuilder(config.getXmlSource())
            .append("fp_")
            .append(category)
            .append('_')
            .append(geographic)
            .append(".xml")
            .toString();
}
/**
 * Normalises a parameter value into the form used in the xml file names: lower case, Nordic
 * letters transliterated to ASCII and spaces replaced by underscores.
 *
 * NOTE(review): the ae ligature and a-umlaut now really expand to "ae". Previously only "a" was
 * produced, see the comment in the body. Confirm the producer of the xml files uses the same
 * transliteration.
 *
 * @param replaceString the raw parameter value, must not be null
 * @return the value formatted for use in a file name
 */
private String formatToConvention(String replaceString) {
    String newString = replaceString.toLowerCase();
    // StringUtils.replaceChars(String, String, String) maps character-for-character and silently
    // ignores surplus replacement characters, so it cannot expand one letter into two.
    // StringUtils.replace does a proper substring replacement.
    newString = StringUtils.replace(newString, "\u00E6", "ae");
    newString = StringUtils.replaceChars(newString, '\u00F8', 'o');
    newString = StringUtils.replaceChars(newString, '\u00E5', 'a');
    newString = StringUtils.replace(newString, "\u00E4", "ae");
    newString = StringUtils.replaceChars(newString, '\u00F6', 'o');
    newString = StringUtils.replaceChars(newString, ' ', '_');
    return newString;
}
/**
 * Loads a single cluster from the pregenerated xml file, falling back to a regular search when
 * the file cannot be read or parsed or yields no hits.
 *
 * @param config the command configuration
 * @param clusterId the id of the requested cluster
 * @param xmlUrl the url of the xml file expected to contain the cluster
 * @return the cluster parsed from xml, or the result of the fallback search
 */
private ResultList<ResultItem> getClusterResult(
        final NewsAggregatorCommandConfig config,
        final StringDataObject clusterId,
        final String xmlUrl) {

    final String sortParameter = config.getUserSortParameter();
    try {
        final StringDataObject sortObject = datamodel.getParameters().getValue(sortParameter);
        final String sort = sortObject == null ? null : sortObject.getString();
        final FastSearchResult<ResultItem> xmlResult = new NewsAggregatorXmlParser()
                .parseCluster(config, xmlUrl, clusterId.getString(), getOffset(), sort);
        // Validate the result before touching it; an unusable result is discarded, so it
        // does not need sort modifiers.
        if (xmlResult != null && xmlResult.getHitCount() > 0) {
            addSortModifiers(xmlResult, sortParameter, "relevance", "descending", "ascending");
            return xmlResult;
        }
    } catch (IOException e) {
        LOG.debug("Falling back to search instead of xml parse", e);
    } catch (SAXException e) {
        LOG.debug("Falling back to search instead of xml parse", e);
    }

    // The xml route gave nothing usable: run the ordinary search instead.
    final ResultList<ResultItem> searchResult = search(config, clusterId.getString());
    if (searchResult instanceof FastSearchResult) {
        addSortModifiers((FastSearchResult) searchResult, sortParameter, "descending", "ascending");
    }
    return searchResult;
}
/**
 * Adds one modifier per given name to the result, all registered under the same id.
 * Each modifier is created with a count of -1 and no navigator.
 *
 * @param searchResult the result receiving the modifiers
 * @param id the id the modifiers are registered under
 * @param modifierNames the names of the modifiers to add, in order
 */
private void addSortModifiers(
        final FastSearchResult searchResult,
        final String id,
        final String... modifierNames) {

    for (int i = 0; i < modifierNames.length; i++) {
        final Modifier sortModifier = new Modifier(modifierNames[i], -1, null);
        searchResult.addModifier(id, sortModifier);
    }
}
/**
 * Fallback used when the pregenerated xml is unusable: logs the search settings at debug level
 * and runs the superclass search.
 *
 * @param config the command configuration, used for logging only
 * @param clusterId the requested cluster id (may be null), used for logging only
 * @return the result of the superclass {@code execute()}
 */
private ResultList<ResultItem> search(
        final NewsAggregatorCommandConfig config,
        final String clusterId) {

    LOG.debug("------ Running search to get clusters ---------");
    LOG.debug("clusterId=" + clusterId);
    LOG.debug("result-fields=" + config.getResultFieldMap());
    LOG.debug("query-server=" + config.getQueryServer());
    LOG.debug("-----------------------------------------------");

    final ResultList<ResultItem> fallbackResult = super.execute();
    return fallbackResult;
}
/**
 * Narrows the inherited configuration to the type this command works with.
 *
 * @return the configuration as a {@link NewsAggregatorCommandConfig}
 */
@Override
public NewsAggregatorCommandConfig getSearchConfiguration() {
    return NewsAggregatorCommandConfig.class.cast(super.getSearchConfiguration());
}
/**
 * Loads the full summary page from the pregenerated xml file, falling back to a regular search
 * when the file cannot be read or parsed or yields no hits.
 *
 * @param config the command configuration
 * @param xmlUrl the url of the xml file holding the summary page
 * @return the page parsed from xml, or the result of the fallback search
 */
private ResultList<ResultItem> getPageResult(
        final NewsAggregatorCommandConfig config,
        final String xmlUrl) {

    final String sortParameter = config.getUserSortParameter();
    try {
        final FastSearchResult<ResultItem> xmlResult
                = new NewsAggregatorXmlParser().parseFullPage(config, getOffset(), xmlUrl);
        // Validate the result before touching it; an unusable result is discarded, so it
        // does not need sort modifiers.
        if (xmlResult != null && xmlResult.getHitCount() > 0) {
            addSortModifiers(xmlResult, sortParameter, "descending", "ascending");
            return xmlResult;
        }
    } catch (IOException e) {
        LOG.debug("Falling back to search instead of xml parse", e);
    } catch (SAXException e) {
        LOG.debug("Falling back to search instead of xml parse", e);
    }

    // The xml route gave nothing usable: run the ordinary search instead.
    final ResultList<ResultItem> searchResult = search(config, null);
    if (searchResult instanceof FastSearchResult) {
        addSortModifiers((FastSearchResult) searchResult, sortParameter, "descending", "ascending");
    }
    return searchResult;
}
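/**
* Parses the pregenerated news aggregator xml files, either a full summary page or a single
* cluster, into search results. Loaded documents are kept in a cache shared by all parser
* instances and keyed by url.
*/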
@SuppressWarnings({"unchecked"})
public static class NewsAggregatorXmlParser {
// Names of the xml elements and attributes read by this parser.
private static final String ELEMENT_CLUSTER = "cluster";
private static final String ELEMENT_ENTRY_COLLECTION = "entryCollection";
private static final String ATTRIBUTE_FULL_COUNT = "fullcount";
private static final String ATTRIBUTE_CLUSTERID = "id";
private static final String ELEMENT_RELATED = "related";
private static final String ATTRIBUTE_TYPE = "type";
private static final String ELEMENT_CATEGORY = "category";
// Read back as a result field name when deciding whether an entry collapses into an earlier one.
private static final String ELEMENT_COLLAPSEID = "collapseid";
// Used by DateFieldSearchResultComparator as the name of the result field holding the entry date.
private static final String ATTRIBUTE_TIMESTAMP = "timestamp";
private static final String ELEMENT_COUNTS = "counts";
private static final String ATTRIBUTE_ENTRY_COUNT = "entries";
private static final String ATTRIBUTE_CLUSTER_COUNT = "clusters";
private static final String ELEMENT_ENTRY = "entry";
// Cache of parsed documents, shared by all parser instances and keyed by the url string.
private static final GeneralCacheAdministrator CACHE = new GeneralCacheAdministrator();
// Refresh period handed to the cache on every lookup.
private static final int REFRESH_PERIOD = 60; // one minute
// Capacity the cache is configured with in the static initialiser below.
private static final int CACHE_CAPACITY = 64;
private static final Logger LOG = Logger.getLogger(NewsAggregatorXmlParser.class);
// Static --------------------------------------------------------
static{
CACHE.setCacheCapacity(CACHE_CAPACITY);
}
/**
 * Fetches and parses the XML document at the given url, bypassing the cache.
 * The {@link HTTPClient} is obtained for the url and asked for the url's path.
 *
 * NOTE(review): the previous documentation mentioned both file:// and http:// urls; whether
 * HTTPClient handles file urls is not visible here, so confirm.
 *
 * @param urlString the url to the xml
 * @return the parsed XML as an org.w3c.dom.Document object
 * @throws org.xml.sax.SAXException if the underlying sax parser throws an exception
 * @throws java.io.IOException if the url is malformed or the file could not be read for some reason
 */
private Document getDocumentFromUrl(String urlString) throws IOException, SAXException {
    final URL location = new URL(urlString);
    return HTTPClient.instance(location).getXmlDocument(location.getPath());
}
/**
* Returns the parsed XML document for the given url, preferring the shared cache.
* When the cache signals, through a NeedsRefreshException, that it has no usable entry,
* the document is loaded from the url and stored in the cache.
* If loading or storing fails, the pending cache update is cancelled and the exception propagates.
* Adapted from TvWaitSearchCommand
*
* @param urlString the url to the xml, also used as the cache key
* @return the parsed XML as an org.w3c.dom.Document object
* @throws org.xml.sax.SAXException if the underlying sax parser throws an exception
* @throws java.io.IOException if the file could not be read for some reason
*/
private final Document getDocumentFromCache(final String urlString) throws IOException, SAXException {
Document doc;
try {
doc = (Document) CACHE.getFromCache(urlString, REFRESH_PERIOD);
LOG.debug("Got doc from cache for " + urlString);
} catch (NeedsRefreshException e) {
// Tracks whether the refresh completed; inspected in the finally block below.
boolean updatedCache = false;
try{
doc = getDocumentFromUrl(urlString);
CACHE.putInCache(urlString, doc);
LOG.debug("Got doc from url and added to cache for " + urlString);
updatedCache = true;
}finally{
if(!updatedCache){
// prevents a deadlock in CACHE!
// (presumably the cache expects every NeedsRefreshException to be answered with
// putInCache or cancelUpdate; confirm against the OSCache documentation)
CACHE.cancelUpdate(urlString);
}
}
}
return doc;
}
/**
 * Parses one specific cluster, identified by its id, out of the xml document.
 * Only the first cluster whose id matches is used. When no cluster matches, the returned
 * result is empty.
 *
 * @param config the commandConfig
 * @param xmlUrl the url to the xml containing the cluster
 * @param clusterId the id of the cluster to parse
 * @param offset the offset into the cluster where to start returning results
 * @param sort the sort direction for the result
 * @return the parsed result
 * @throws org.xml.sax.SAXException if the underlying sax parser throws an exception
 * @throws java.io.IOException if the file could not be read for some reason
 */
public FastSearchResult<ResultItem> parseCluster(
        final NewsAggregatorCommandConfig config,
        final String xmlUrl,
        final String clusterId,
        final int offset,
        final String sort) throws IOException, SAXException {

    LOG.debug("Parsing cluster: " + clusterId);

    // NOTE(review): an earlier comment here said the following "will either throw a
    // ClassCastException or NPE"; nothing visible in this method supports that, so confirm.
    final FastSearchResult<ResultItem> searchResult = new FastSearchResult<ResultItem>();
    final Element root = getDocumentFromCache(xmlUrl).getDocumentElement();

    // Locate the first cluster carrying the requested id.
    final List<Element> candidates = getDirectChildren(root, ELEMENT_CLUSTER);
    Element wanted = null;
    for (int i = 0; wanted == null && i < candidates.size(); i++) {
        final Element candidate = candidates.get(i);
        if (candidate.getAttribute(ATTRIBUTE_CLUSTERID).equals(clusterId)) {
            wanted = candidate;
        }
    }

    if (wanted != null) {
        handleFlatCluster(config, wanted, searchResult, offset, sort);
        handleRelated(config, getFirstChild(wanted, ELEMENT_RELATED), searchResult);
    }
    return searchResult;
}
/**
 * Parses a full summary xml page: the clusters in the requested window, the overall counts
 * and the related categories found directly under the document root.
 *
 * @param config the commandConfig
 * @param offset what cluster to start parsing at
 * @param xmlUrl the url to the page to parse
 * @return the result of the parse
 * @throws org.xml.sax.SAXException if the underlying sax parser throws an exception
 * @throws java.io.IOException if the file could not be read for some reason
 */
public FastSearchResult<ResultItem> parseFullPage(
        final NewsAggregatorCommandConfig config,
        final int offset,
        final String xmlUrl) throws IOException, SAXException {

    // NOTE(review): an earlier comment here said the following "will throw a
    // ClassCastException or NPE"; nothing visible in this method supports that, so confirm.
    final FastSearchResult<ResultItem> searchResult = new FastSearchResult<ResultItem>();
    final Element root = getDocumentFromCache(xmlUrl).getDocumentElement();

    final List<Element> clusterElements = getDirectChildren(root, ELEMENT_CLUSTER);
    final Element countsElement = getFirstChild(root, ELEMENT_COUNTS);
    final Element relatedElement = getFirstChild(root, ELEMENT_RELATED);

    handleClusters(config, offset, clusterElements, searchResult);
    handleCounts(config, countsElement, offset, searchResult);
    handleRelated(config, relatedElement, searchResult);
    return searchResult;
}
/**
 * Transfers the totals from the counts element to the result: the entry count becomes the hit
 * count, and a next-offset field is added when more clusters exist beyond the current window.
 * Does nothing when the counts element is missing.
 *
 * @param config the commandConfig, supplying the window size
 * @param countsElement the counts element, may be null
 * @param offset the offset of the current window
 * @param searchResult the result to update
 */
private void handleCounts(
        final NewsAggregatorCommandConfig config,
        final Element countsElement,
        final int offset,
        final FastSearchResult searchResult) {

    if (countsElement == null) {
        return;
    }

    final String entryCount = countsElement.getAttribute(ATTRIBUTE_ENTRY_COUNT);
    if (entryCount != null && entryCount.length() > 0) {
        searchResult.setHitCount(Integer.parseInt(entryCount));
    }

    final String clusterCount = countsElement.getAttribute(ATTRIBUTE_CLUSTER_COUNT);
    if (clusterCount == null || clusterCount.length() == 0) {
        return;
    }

    // Offer a next page only when clusters remain after this window.
    final int nextOffset = offset + config.getResultsToReturn();
    if (nextOffset < Integer.parseInt(clusterCount)) {
        addNextOffsetField(nextOffset, searchResult);
    }
}
/**
 * Adds the categories listed under the related element as modifiers, grouped by category type.
 * Once a type holds the configured maximum number of modifiers, further categories of that type
 * are skipped. Does nothing when the related element is missing.
 *
 * @param config the commandConfig, supplying the maximum per type
 * @param relatedElement the related element, may be null
 * @param searchResult the result receiving the modifiers
 */
private void handleRelated(
        final NewsAggregatorCommandConfig config,
        final Element relatedElement,
        final FastSearchResult searchResult) {

    if (relatedElement == null) {
        return;
    }

    for (final Element category : getDirectChildren(relatedElement, ELEMENT_CATEGORY)) {
        final String type = category.getAttribute(ATTRIBUTE_TYPE);
        final List<Modifier> existing = searchResult.getModifiers(type);
        final int alreadyAdded = existing == null ? 0 : existing.size();

        if (alreadyAdded < config.getRelatedMaxCount()) {
            searchResult.addModifier(type, new Modifier(category.getTextContent().trim(), -1, null));
        }
    }
}
/**
 * Parses the window of clusters starting at the offset and spanning at most the configured
 * number of results, adding each one to the search result.
 *
 * @param config the commandConfig, supplying the window size
 * @param offset index of the first cluster to parse
 * @param clusters all cluster elements of the page
 * @param searchResult the result receiving the parsed clusters
 */
private void handleClusters(
        final NewsAggregatorCommandConfig config,
        final int offset,
        final List<Element> clusters,
        final ResultList<ResultItem> searchResult) {

    final int end = Math.min(clusters.size(), offset + config.getResultsToReturn());
    for (int index = offset; index < end; index++) {
        handleCluster(config, clusters.get(index), searchResult);
    }
}
/**
 * Parses all entries of a single cluster into a flat, optionally sorted list and copies the
 * window selected by the offset into the search result.
 * Entries sharing a collapse id are nested under the first entry carrying that id. The hit count
 * is set to the raw number of entries, before collapsing. A next-offset field is added when
 * results remain beyond the window. Does nothing when the cluster or its entry collection is
 * missing.
 *
 * @param config the commandConfig, supplying the window size
 * @param cluster the cluster element, may be null
 * @param searchResult the result receiving the entries
 * @param offset index of the first collapsed result to return
 * @param sort "ascending" or "descending" to sort by date, anything else keeps document order
 */
private void handleFlatCluster(
        final NewsAggregatorCommandConfig config,
        final Element cluster,
        final ResultList<ResultItem> searchResult,
        int offset,
        final String sort) {

    if (cluster == null) {
        return;
    }
    final Element entryCollectionElement = getFirstChild(cluster, ELEMENT_ENTRY_COLLECTION);
    if (entryCollectionElement == null) {
        return;
    }

    final List<Element> entryList = getDirectChildren(entryCollectionElement, ELEMENT_ENTRY);
    searchResult.setHitCount(entryList.size());

    final Map<String, ResultList<ResultItem>> collapseMap = new HashMap<String, ResultList<ResultItem>>();
    final ResultList<ResultItem> tmpSearchResult = new BasicResultList<ResultItem>();

    // Every entry has to be collected before paging, otherwise collapsing would miss
    // duplicates that fall outside the requested window.
    for (final Element entry : entryList) {
        // Use the item returned by handleEntry, as handleCluster does, rather than relying on
        // the passed-in instance having been modified in place.
        final ResultList<ResultItem> searchResultItem
                = (ResultList<ResultItem>) handleEntry(entry, new BasicResultList<ResultItem>());
        addResult(config, searchResultItem, tmpSearchResult, collapseMap, true);
    }
    sortResults(tmpSearchResult, sort);

    final int windowEnd = offset + config.getResultsToReturn();
    final int lastIndex = Math.min(tmpSearchResult.getResults().size(), windowEnd);
    for (int i = offset; i < lastIndex; i++) {
        searchResult.addResult(tmpSearchResult.getResults().get(i));
    }
    if (windowEnd < tmpSearchResult.getResults().size()) {
        addNextOffsetField(windowEnd, searchResult);
    }
}
/**
 * Sorts the results by their date field when a sort direction is requested.
 * "ascending" sorts oldest first and "descending" newest first; any other value, including
 * null, leaves the order untouched.
 *
 * @param searchResult the results to sort in place
 * @param sort the requested sort direction, may be null
 */
private void sortResults(final ResultList<ResultItem> searchResult, final String sort) {
    final Comparator<ResultItem> order;
    if ("ascending".equals(sort)) {
        order = DateFieldSearchResultComparator.getInstance();
    } else if ("descending".equals(sort)) {
        order = Collections.reverseOrder(DateFieldSearchResultComparator.getInstance());
    } else {
        order = null;
    }

    if (order != null) {
        searchResult.sortResults(order);
    }
}
/**
 * Parses one cluster of a summary page into a nested result and adds it to the search result.
 * The first entry supplies the fields of the cluster itself and the remaining entries become
 * its sub results. The cluster also gets a "size" field (full count minus one) and its id.
 * Nothing is added when the cluster has no entry collection.
 *
 * NOTE(review): a missing or non-numeric full count attribute makes Integer.parseInt throw a
 * NumberFormatException, which is not handled here.
 *
 * @param config the commandConfig
 * @param cluster the cluster element, may be null
 * @param searchResult the result receiving the parsed cluster
 * @return the number of entries found in the cluster, or 0 when nothing was added
 */
private int handleCluster(
        final NewsAggregatorCommandConfig config,
        final Element cluster,
        final ResultList<ResultItem> searchResult) {

    if (cluster == null) {
        return 0;
    }

    ResultList<ResultItem> clusterResult = new BasicResultList<ResultItem>();
    final int fullCount = Integer.parseInt(cluster.getAttribute(ATTRIBUTE_FULL_COUNT));
    clusterResult = clusterResult.addField("size", Integer.toString(fullCount - 1));
    clusterResult = clusterResult.addField(PARAM_CLUSTER_ID, cluster.getAttribute(ATTRIBUTE_CLUSTERID));

    final Element entryCollection = getFirstChild(cluster, ELEMENT_ENTRY_COLLECTION);
    if (entryCollection == null) {
        return 0;
    }

    final List<Element> entries = getDirectChildren(entryCollection, ELEMENT_ENTRY);
    if (!entries.isEmpty()) {
        // The first entry is the main result of the cluster.
        clusterResult = (ResultList<ResultItem>) handleEntry(entries.get(0), clusterResult);
    }
    for (int i = 1; i < entries.size(); i++) {
        // All following entries are nested below the main result.
        final ResultList<ResultItem> nested
                = (ResultList<ResultItem>) handleEntry(entries.get(i), new BasicResultList<ResultItem>());
        addResult(config, nested, clusterResult);
    }

    searchResult.addResult(clusterResult);
    clusterResult.setHitCount(entries.size());
    return entries.size();
}
/**
 * Copies every direct child element of an entry onto the result item as a field, named after
 * the element and holding its trimmed text. Children whose text is empty after trimming are
 * skipped.
 *
 * @param entryElement the entry element to read
 * @param searchResultItem the item to add the fields to
 * @return the item returned by the last addField call, or the passed item if no field was added
 */
private ResultItem handleEntry(final Element entryElement, ResultItem searchResultItem) {
    for (final Element child : getDirectChildren(entryElement)) {
        final String text = child.getTextContent();
        if (text == null) {
            continue;
        }
        final String value = text.trim();
        if (value.length() > 0) {
            searchResultItem = searchResultItem.addField(child.getNodeName(), value);
        }
    }
    return searchResultItem;
}
/**
 * Adds a result to the target without any collapsing and subject to the configured maximum
 * number of results.
 *
 * @param config the commandConfig, supplying the maximum
 * @param srcResult the result to add
 * @param targetResult the list to add it to
 */
private void addResult(
        final NewsAggregatorCommandConfig config,
        final ResultList<ResultItem> srcResult,
        final ResultList<ResultItem> targetResult) {

    final Map<String, ResultList<ResultItem>> noCollapsing = null;
    final boolean ignoreMaximum = false;
    addResult(config, srcResult, targetResult, noCollapsing, ignoreMaximum);
}
/**
 * Adds a result to the target, collapsing duplicates when a collapse map is supplied.
 * A result whose collapse id is already in the map is nested under the result first seen with
 * that id; otherwise it is added to the target and remembered in the map. Results without a
 * collapse id never take part in collapsing. Previously they all shared the null map key and so
 * were wrongly nested under the first result that lacked an id.
 *
 * @param config the commandConfig, supplying the maximum number of results
 * @param srcResult the result to add
 * @param targetResult the list receiving results that are not collapsed
 * @param collapseMap results seen so far keyed by collapse id, or null to disable collapsing
 * @param noMax true to ignore the configured maximum number of results
 * @return true if the result was added to the target or to a collapse parent, false if it was
 *         dropped because the maximum was reached
 */
private boolean addResult(final NewsAggregatorCommandConfig config,
        final ResultList<ResultItem> srcResult,
        final ResultList<ResultItem> targetResult,
        final Map<String, ResultList<ResultItem>> collapseMap,
        final boolean noMax) {

    final String collapseId = srcResult.getField(ELEMENT_COLLAPSEID);
    final boolean collapsible = collapseMap != null && collapseId != null;

    // Duplicate of an earlier item: nest it below that first item.
    final ResultList<ResultItem> collapseParent = collapsible ? collapseMap.get(collapseId) : null;
    if (collapseParent != null) {
        collapseParent.addResult(srcResult);
        return true;
    }

    // Skip the add when the maximum number of returned results has been reached.
    if (!noMax && targetResult.getResults().size() >= config.getResultsToReturn()) {
        return false;
    }

    targetResult.addResult(srcResult);
    if (collapsible) {
        collapseMap.put(collapseId, srcResult);
    }
    return true;
}
/**
 * Finds the first direct child element with the given node name.
 *
 * @param element the parent element, may be null
 * @param elementName the node name to look for
 * @return the first matching child in document order, or null if there is none or the parent is null
 */
private static Element getFirstChild(Element element, String elementName) {
    if (element == null) {
        return null;
    }
    for (Node child = element.getFirstChild(); child != null; child = child.getNextSibling()) {
        if (child instanceof Element && child.getNodeName().equals(elementName)) {
            return (Element) child;
        }
    }
    return null;
}
/**
 * Collects all direct child elements with the given node name.
 *
 * @param element the parent element, may be null
 * @param elementName the node name to look for
 * @return the matching children in document order; empty if there are none or the parent is null
 */
private static List<Element> getDirectChildren(Element element, String elementName) {
    final List<Element> matches = new ArrayList<Element>();
    if (element == null) {
        return matches;
    }
    for (Node child = element.getFirstChild(); child != null; child = child.getNextSibling()) {
        if (child instanceof Element && child.getNodeName().equals(elementName)) {
            matches.add((Element) child);
        }
    }
    return matches;
}
/**
 * Collects all direct child elements, skipping text and other non-element nodes.
 *
 * @param element the parent element, may be null
 * @return the child elements in document order; empty if there are none or the parent is null
 */
private static List<Element> getDirectChildren(Element element) {
    final List<Element> elements = new ArrayList<Element>();
    if (element == null) {
        return elements;
    }
    for (Node child = element.getFirstChild(); child != null; child = child.getNextSibling()) {
        if (child instanceof Element) {
            elements.add((Element) child);
        }
    }
    return elements;
}
}
/**
 * Orders result items chronologically by their timestamp field.
 * Items without a timestamp sort before items that have one. Two items compare as equal when
 * either timestamp cannot be parsed.
 *
 * The comparator is a shared singleton, so it must be safe to use from several threads at once.
 * SimpleDateFormat is not thread-safe, so every thread gets its own formatter instead of
 * sharing a single instance field.
 */
private static final class DateFieldSearchResultComparator implements Comparator<ResultItem> {
    private static final String DATE_PATTERN = "yyyy-MM-dd'T'HH:mm:ss'Z'";
    private static final String DATE_FIELD_NAME = NewsAggregatorXmlParser.ATTRIBUTE_TIMESTAMP;

    // One formatter per thread: SimpleDateFormat keeps mutable parse state.
    private static final ThreadLocal<SimpleDateFormat> DATE_FORMAT = new ThreadLocal<SimpleDateFormat>() {
        @Override
        protected SimpleDateFormat initialValue() {
            return new SimpleDateFormat(DATE_PATTERN);
        }
    };

    private static final DateFieldSearchResultComparator INSTANCE = new DateFieldSearchResultComparator();

    /**
     * @return the shared comparator instance
     */
    public static DateFieldSearchResultComparator getInstance() {
        return INSTANCE;
    }

    private DateFieldSearchResultComparator() {
    }

    /**
     * Compares two result items by their timestamp field.
     *
     * @param resultItem1 the first item
     * @param resultItem2 the second item
     * @return a negative value, zero or a positive value as the first item is older than,
     *         as old as or newer than the second
     */
    public int compare(final ResultItem resultItem1, final ResultItem resultItem2) {
        final String dateField1 = resultItem1.getField(DATE_FIELD_NAME);
        final String dateField2 = resultItem2.getField(DATE_FIELD_NAME);
        final boolean missing1 = dateField1 == null || dateField1.length() == 0;
        final boolean missing2 = dateField2 == null || dateField2.length() == 0;

        // Items lacking a date are placed before dated ones.
        if (missing1) {
            return missing2 ? 0 : -1;
        }
        if (missing2) {
            return 1;
        }

        try {
            final SimpleDateFormat format = DATE_FORMAT.get();
            final Date date1 = format.parse(dateField1);
            final Date date2 = format.parse(dateField2);
            return date1.compareTo(date2);
        } catch (ParseException e) {
            LOG.error("Could not parse date field, sort will not work.", e);
            return 0;
        }
    }
}
}