/* * Carrot2 project. * * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński. * All rights reserved. * * Refer to the full license file "carrot2.LICENSE" * in the root folder of the repository checkout or at: * http://www.carrot2.org/carrot2.LICENSE */ package org.carrot2.webapp.filter; import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; import org.carrot2.core.Document; import org.carrot2.core.IControllerContext; import org.carrot2.core.ProcessingComponentBase; import org.carrot2.core.ProcessingException; import org.carrot2.core.attribute.AttributeNames; import org.carrot2.core.attribute.Init; import org.carrot2.core.attribute.Internal; import org.carrot2.core.attribute.Processing; import org.carrot2.util.attribute.Attribute; import org.carrot2.util.attribute.Bindable; import org.carrot2.util.attribute.Input; import org.carrot2.util.attribute.Output; import org.carrot2.util.attribute.Required; import org.carrot2.shaded.guava.common.base.Joiner; import org.carrot2.shaded.guava.common.base.Strings; import org.carrot2.shaded.guava.common.collect.Lists; /** * Highlights query words in documents using the <b> HTML tag. Highlighting is * performed on the fields specified in {@link #fields}, the results are saved in fields * with names suffixed by {@link #HIGHLIGHTED_FIELD_NAME_SUFFIX}. */ @Bindable public class QueryWordHighlighter extends ProcessingComponentBase { /** * Suffix appended */ public static final String HIGHLIGHTED_FIELD_NAME_SUFFIX = "-highlight"; /** * Enable or disable query highlighter. */ @Init @Processing @Input @Attribute(key = "QueryWordHighlighter.enabled") public boolean enabled = true; /** * Enable or disable query highlighter. */ @Init @Processing @Input @Attribute(key = "QueryWordHighlighter.maxContentLength") public int maxContentLength = Integer.MAX_VALUE; /** * A regular expression that disables highlighting for certain terms. */ @Init @Input @Attribute(key = "QueryWordHighlighter.dontHighlightPattern") public String dontHighlightPattern; private Pattern dontHighlightPatternCompiled; /** * Query-sanitize pattern (any matches are replaced with an empty string). */ @Init @Input @Attribute(key = "QueryWordHighlighter.querySanitizePattern") public String querySanitizePattern = "[\"'()]"; private Pattern querySanitizePatternCompiled; /** * Query that produced the documents, optional. If query is blank, no processing will * be performed. */ @Processing @Input @Internal @Attribute(key = AttributeNames.QUERY) public String query = null; /** * {@link org.carrot2.core.Document}s to highlight query words in. */ @Processing @Input @Output @Required @Internal @Attribute(key = AttributeNames.DOCUMENTS) public List<Document> documents; /** * Fields of the {@link org.carrot2.core.Document} that should have the query words highlighted. */ @Init @Input @Attribute public Collection<String> fields = Arrays.asList(new String [] { Document.TITLE, Document.SUMMARY }); @Override public void init(IControllerContext context) { super.init(context); if (dontHighlightPattern != null) { dontHighlightPatternCompiled = Pattern.compile(dontHighlightPattern); } if (querySanitizePattern != null) { querySanitizePatternCompiled = Pattern.compile(querySanitizePattern); } } @Override public void process() throws ProcessingException { if (!enabled) { return; } if (query == null) { query = ""; } // Create regexp patterns for each query word final String [] queryTerms = querySanitizePatternCompiled .matcher(query).replaceAll("") .split("\\s+"); Pattern queryPattern = null; List<String> patterns = Lists.newArrayList(); for (String queryTerm : queryTerms) { if (Strings.isNullOrEmpty(queryTerm)) { continue; } if (dontHighlightPatternCompiled != null && dontHighlightPatternCompiled.matcher(queryTerm).matches()) { continue; } patterns.add("(" + Pattern.quote(escapeLtGt(queryTerm)) + ")"); } if (patterns.size() > 0) { queryPattern = Pattern.compile( Joiner.on("|").join(patterns), Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); } // As we're going to modify documents, we need to copy them to // avoid ConcurrentModificationExceptions. final List<Document> inputDocuments = documents; final List<Document> outputDocuments = Lists .newArrayListWithCapacity(inputDocuments.size()); for (Document document : inputDocuments) { final Document clonedDocument = document.clone(); for (String fieldName : fields) { highlightQueryTerms(clonedDocument, fieldName, queryPattern); } outputDocuments.add(clonedDocument); } documents = outputDocuments; } private void highlightQueryTerms(Document document, String fieldName, Pattern queryPattern) { String field = (String) document.getField(fieldName); if (StringUtils.isBlank(field)) { return; } if (field.length() > maxContentLength) { field = field.substring(0, maxContentLength) + "..."; } field = escapeLtGt(field); if (queryPattern != null) { Matcher matcher = queryPattern.matcher(field); field = matcher.replaceAll("<b>$0</b>"); } document.setField(fieldName + HIGHLIGHTED_FIELD_NAME_SUFFIX, field); } private static final Pattern LT_PATTERN = Pattern.compile("<"); private static final Pattern GT_PATTERN = Pattern.compile(">"); private String escapeLtGt(String field) { field = LT_PATTERN.matcher(field).replaceAll("<"); field = GT_PATTERN.matcher(field).replaceAll(">"); return field; } }