/* * Carrot2 project. * * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński. * All rights reserved. * * Refer to the full license file "carrot2.LICENSE" * in the root folder of the repository checkout or at: * http://www.carrot2.org/carrot2.LICENSE */ package org.carrot2.source.lucene; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.commons.lang.StringUtils; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexableField; import org.apache.lucene.search.Query; import org.apache.lucene.search.highlight.*; import org.carrot2.core.attribute.*; import org.carrot2.util.ExceptionUtils; import org.carrot2.util.attribute.*; import org.carrot2.util.attribute.constraint.ImplementingClasses; import org.carrot2.util.attribute.constraint.IntRange; /** * A simple {@link IFieldMapper} with one-to-one mapping between the default title, url * and summary fields. */ @Bindable public class SimpleFieldMapper implements IFieldMapper { /** {@link Group} name. */ final static String INDEX_FIELD_MAPPING = "Index field mapping"; /** {@link Group} name. */ final static String HIGHLIGHTER = "Highlighter"; /** * Document title field name. */ @Input @Attribute @Init @Processing @Internal(configuration = true) @Label("Document title field") @Level(AttributeLevel.BASIC) @Group(INDEX_FIELD_MAPPING) public String titleField; /** * Document content field name. */ @Input @Attribute @Init @Processing @Internal(configuration = true) @Label("Document content field") @Level(AttributeLevel.BASIC) @Group(INDEX_FIELD_MAPPING) public String contentField; /** * Document URL field name. */ @Input @Attribute @Init @Processing @Internal(configuration = true) @Label("Document URL field") @Level(AttributeLevel.BASIC) @Group(INDEX_FIELD_MAPPING) public String urlField; /** * Index search field names. If not specified, title and content fields are used. */ @Input @Attribute @Init @Processing @Internal(configuration = true) @Label("Search fields") @Level(AttributeLevel.MEDIUM) @Group(INDEX_FIELD_MAPPING) public List<String> searchFields; /** * Snippet formatter for the highlighter. Highlighter is not used if <code>null</code>. */ @Input @Attribute @Init @Processing @ImplementingClasses(classes = { PlainTextFormatter.class, SimpleHTMLFormatter.class }, strict = false) @Label("Formatter") @Level(AttributeLevel.ADVANCED) @Group(HIGHLIGHTER) public Formatter formatter = new PlainTextFormatter(); /** * Number of context fragments for the highlighter. */ @Input @Attribute @Init @Processing @IntRange(min = 1) @Label("Context fragments") @Level(AttributeLevel.ADVANCED) @Group(HIGHLIGHTER) public int contextFragments = 3; /** * A string used to join context fragments when highlighting. */ @Input @Attribute @Init @Processing @Label("Join string") @Level(AttributeLevel.ADVANCED) @Group(HIGHLIGHTER) public String fragmentJoin = "..."; /** * Last initialized highlighter. */ private Highlighter highlighter; /** * Last received {@link Query} object in * {@link #map(Query, Analyzer, Document, org.carrot2.core.Document)}. */ private Query query; /* * */ public String [] getSearchFields() { if (searchFields == null || searchFields.size() == 0) { ArrayList<String> fields = new ArrayList<String>(); if (!StringUtils.isEmpty(titleField)) { fields.add(titleField); } if (!StringUtils.isEmpty(contentField)) { fields.add(contentField); } return fields.toArray(new String [fields.size()]); } return searchFields.toArray(new String [searchFields.size()]); } /* * */ public void map(Query luceneQuery, Analyzer analyzer, Document luceneDoc, org.carrot2.core.Document doc) { if (luceneQuery != query) { this.query = luceneQuery; resetHighlighter(); } /* * Map title and url */ String value = fieldValue(titleField, luceneDoc); if (value != null) { doc.setField(org.carrot2.core.Document.TITLE, value); } value = fieldValue(urlField, luceneDoc); if (value != null) { doc.setField(org.carrot2.core.Document.CONTENT_URL, value); } /* * Map content field. */ value = fieldValue(contentField, luceneDoc); if (value != null) { try { final String summary; if (this.highlighter != null) { final String [] fragments = highlighter.getBestFragments(analyzer, contentField, value, contextFragments); if (fragments.length > 0) { summary = StringUtils.join(fragments, fragmentJoin); } else { summary = value; } } else { summary = value; } doc.setField(org.carrot2.core.Document.SUMMARY, summary); } catch (IOException e) { throw ExceptionUtils.wrapAsRuntimeException(e); } catch (InvalidTokenOffsetsException e) { throw ExceptionUtils.wrapAsRuntimeException(e); } } } /* * */ private String fieldValue(String fieldName, Document doc) { if (StringUtils.isEmpty(fieldName)) { return null; } StringBuilder builder = null; for (IndexableField field : doc.getFields()) { if (field.name().equals(fieldName)) { if (builder == null) builder = new StringBuilder(); if (builder.length() > 0) builder.append(" "); builder.append(field.stringValue()); } } return builder == null ? null : builder.toString(); } /* * */ private void resetHighlighter() { if (formatter != null) { this.highlighter = new Highlighter(formatter, new QueryScorer(query)); this.highlighter.setEncoder(new DefaultEncoder()); } else { this.highlighter = null; } } }