package mj.ocraptor.database.search;
import static mj.ocraptor.configuration.Config.APP_NAME_LOWER;
import static mj.ocraptor.extraction.image_processing.TikaImageHelper.IMAGE_CONTAINER_CLASS;
import java.io.File;
import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import mj.ocraptor.configuration.Config;
import mj.ocraptor.configuration.Localization;
import mj.ocraptor.configuration.properties.ConfigString;
import mj.ocraptor.console.COF;
import mj.ocraptor.console.ExtendedAscii;
import mj.ocraptor.database.StandardAnalyzer;
import mj.ocraptor.file_handler.utils.FileTools;
import mj.ocraptor.tools.St;
import mj.ocraptor.tools.Tp;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.util.Version;
public class TextProcessing {
// ------------------------------------------------ //
// *INDENT-OFF*
private static final String
FILEINFO_CLASS = "fileInfo",
FILEINFO_TITLE_CLASS = "fileInfoTitle",
FILEINFO_KEY_CLASS = "key",
// ------------------------------------------------ //
METADATA_CLASS = "metadata",
METADATA_TITLE_CLASS = "metadataTitle",
// ------------------------------------------------ //
PAGE_CLASS = "page",
SNIPPET_CLASS = "highlightedSnippet",
// ------------------------------------------------ //
IMAGE_TITLE_CLASS = "imageDataTitle",
// ------------------------------------------------ //
SCRIPT_TAG_OPEN = "<script type=\"text/javascript\" src=\"",
SCRIPT_TAG_CLOSE = "\"></script>",
STYLE_TAG_OPEN = "<link rel=\"stylesheet\" type=\"text/css\" href=\"",
STYLE_TAG_CLOSE = "\"/>",
FAVICON_TAG_OPEN = "<link rel=\"shortcut icon\" type=\"image/x-icon\" href=\"",
FAVICON_TAG_CLOSE = "\"/>",
PAGE_IND_CLASS = "pageIndicator",
XMLNS = "<div xmlns=\"http://www.w3.org/1999/xhtml\">",
SNIPPET_ID_PREFIX = "snippet_",
STYLESHEET = "<head>" +
FAVICON_TAG_OPEN + "{0}/favicon.ico" +
FAVICON_TAG_CLOSE +
SCRIPT_TAG_OPEN + "{0}/jquery_2_1_4_min.js" +
SCRIPT_TAG_CLOSE +
SCRIPT_TAG_OPEN + "{0}/jquery_arbitrary_anchor.js" +
SCRIPT_TAG_CLOSE +
SCRIPT_TAG_OPEN + "{0}/jquery_scrollup_min.js" +
SCRIPT_TAG_CLOSE +
SCRIPT_TAG_OPEN + "{0}/" + APP_NAME_LOWER + ".js" +
SCRIPT_TAG_CLOSE +
STYLE_TAG_OPEN + "{0}/" + APP_NAME_LOWER + ".css" +
STYLE_TAG_CLOSE +
"<script type=\"text/javascript\">" +
"jQuery(document).ready(function() '{'" +
"jQuery(\"#scrollUp\").text(\"{1}\");" +
"'}');" +
"</script>" +
"</head>",
HTML_EXTENSION = ".html",
HTML_BREAKLINE = "<br/>",
METADATA_START = "<div class=\"" + METADATA_CLASS + "\">";
// *INDENT-ON*
// ------------------------------------------------ //
// *INDENT-OFF*
public final static String
SNIPPET_SYMBOL = ExtendedAscii.getAsciiAsString(174) + " ",
PAGE_INDICATOR_OPEN = "<div class=\"" + PAGE_CLASS + "\"",
PAGE_INDICATOR_CLOSED = PAGE_INDICATOR_OPEN + ">",
PAGE_INDICATOR_IMAGE_OPEN = "<div class=\"" + IMAGE_CONTAINER_CLASS + "\"",
PAGE_INDICATOR_IMAGE_CLOSED = PAGE_INDICATOR_IMAGE_OPEN + ">",
PAGE_INDICATOR_META_OPEN = "<div class=\"" + METADATA_CLASS + "\"",
PAGE_INDICATOR_META_CLOSED = PAGE_INDICATOR_META_OPEN + ">",
PAGE_INDICATOR_CUSTOM = "<span page=\"",
PAGE_ID = " id=\"page_",
PAGE_INDICATOR_OPEN_WITH_ID = PAGE_INDICATOR_OPEN + PAGE_ID,
PAGE_INDICATOR_META_OPEN_WITH_ID = PAGE_INDICATOR_META_OPEN + PAGE_ID,
PAGE_INDICATOR_IMAGE_OPEN_WITH_ID = PAGE_INDICATOR_IMAGE_OPEN + PAGE_ID,
OCR_IMAGE_BREAKLINE = "|",
PAGE_MARK_STRIPPED = St.generatePassword(130, 32),
PAGE_MARK_CLOSED = "<page>" + PAGE_MARK_STRIPPED + "</page>";
// *INDENT-ON*
private static final int MIN_SEARCH_HIT_DISTANCE = 200;
private static final int XHTML_SCROLL_TIME_IN_MS = 1000;
private static Analyzer luceneAnalyzer;
private static QueryParser queryParser;
// ------------------------------------------------ //
// --
// ------------------------------------------------ //
/**
*
*/
static {
// TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_36, new
// StringReader(string));
List<String> stopWordsList = null;
final String stopWordsString = Config.inst().getProp(ConfigString.STOP_WORDS);
if (stopWordsString != null && !stopWordsString.trim().isEmpty()) {
stopWordsList = new ArrayList<String>();
for (String stopWord : stopWordsString.split(";")) {
stopWord = stopWord.trim();
if (!stopWord.isEmpty() && !stopWordsList.contains(stopWord)) {
stopWordsList.add(stopWord);
}
}
}
// CharArraySet stopSet = CharArraySet.copy(Version.LUCENE_30,
// StandardAnalyzer.STOP_WORDS_SET);
CharArraySet stopSet = CharArraySet.copy(Version.LUCENE_30, new HashSet<String>(stopWordsList));
final Version luceneVersion = Version.LUCENE_30;
luceneAnalyzer = new StandardAnalyzer(luceneVersion, stopSet);
queryParser = new QueryParser(luceneVersion, "_DATA", luceneAnalyzer);
queryParser.setAllowLeadingWildcard(true);
}
/**
*
*
* @param xml
* @return
*/
public static String preProcess(String xml) {
// *INDENT-OFF*
xml = xml.
// ms excel fix, generates to many rows
replace("<tr> </tr>", "").
replace("<tr></tr>", "").
replace("<tr/> <tr/>", "").
replace("<tr/><tr/>", "").
// remove some some special characters
replace("\ufffd", "").
replace("\u25a0", "").
replace("\u2022", "").
// remove multiple punctuation
replaceAll("(\\s?(\\.)\\s?)+", ".").
replaceAll("(\\s?(\\,)\\s?)+", ",").
replaceAll("\\|+", "|")
;
xml = St.stripUrlTags(xml);
xml = St.normalizeDocumentText(xml);
// *INDENT-ON*
return xml;
}
/**
*
*
* @param xml
* @return
*/
public static String postProcess(String xml) {
// TODO: performance testing
// TODO: multiple postProcess call for one single file
xml = St.stripHtmlTags(xml);
xml = St.replaceLineBreaks(xml);
xml = xml.replaceAll("\\s+", " ");
return xml;
}
// ------------------------------------------------ //
// --
// ------------------------------------------------ //
/**
*
*
* @param text
* @return
*/
public static String encodePagePositions(final String text) {
String encodedXml = null;
try {
encodedXml = encodePagePositions(text, PAGE_INDICATOR_OPEN, PAGE_INDICATOR_CLOSED);
encodedXml = encodePagePositions(encodedXml, PAGE_INDICATOR_IMAGE_OPEN,
PAGE_INDICATOR_IMAGE_CLOSED);
encodedXml = encodePagePositions(encodedXml, PAGE_INDICATOR_META_OPEN,
PAGE_INDICATOR_META_CLOSED);
} catch (Exception e) {
e.printStackTrace();
}
return encodedXml;
}
/**
*
*
* @return
*/
private static String encodePagePositions(final String text, final String openIndicator,
final String closedIndicator) throws Exception {
// ------------------------------------------------ //
int index = -1;
final ArrayList<Integer> pageIndicatorIndex = new ArrayList<Integer>();
while (!Thread.currentThread().isInterrupted()) {
index = text.indexOf(closedIndicator, index + 1);
if (index < 0) {
break;
}
pageIndicatorIndex.add(index);
}
StringBuilder textWithTempPageMarker = new StringBuilder(text);
for (int i = pageIndicatorIndex.size() - 1; i >= 0; i--) {
Integer ind = pageIndicatorIndex.get(i) + closedIndicator.length();
textWithTempPageMarker.insert(ind, PAGE_MARK_CLOSED);
}
// ------------------------------------------------ //
final String postProcessedText = postProcess(textWithTempPageMarker.toString());
final ArrayList<Integer> pageTempMarkerPosition = new ArrayList<Integer>();
index = -1;
while (!Thread.currentThread().isInterrupted()) {
index = postProcessedText.indexOf(PAGE_MARK_STRIPPED, index + 1);
if (index < 0) {
break;
}
pageTempMarkerPosition.add(index);
}
// ------------------------------------------------ //
final ArrayList<Integer> actualPagePositions = new ArrayList<Integer>();
for (int i = pageTempMarkerPosition.size() - 1; i >= 0; i--) {
Integer ind = pageTempMarkerPosition.get(i);
int newIndex = ind - (i * PAGE_MARK_STRIPPED.length());
actualPagePositions.add(newIndex);
}
// ------------------------------------------------ //
String finalOutput = text;
final Pattern pattern = Pattern.compile(PAGE_INDICATOR_CUSTOM + "(.+?)\"");
for (int i = pageIndicatorIndex.size() - 1; i >= 0; i--) {
int pageCount = (i + 1);
if (openIndicator.equals(PAGE_INDICATOR_IMAGE_OPEN)) {
final Matcher matcher = pattern.matcher(text.substring(pageIndicatorIndex.get(i)));
if (matcher.find()) {
final String page[] = matcher.group(1).split(":");
if (page.length > 0) {
try {
pageCount = Integer.parseInt(page[0]);
} catch (NumberFormatException e) {
}
}
}
}
final String insert = PAGE_ID.concat(
String.valueOf(actualPagePositions.get(pageIndicatorIndex.size() - (i + 1)))).concat(":")
.concat(String.valueOf(pageCount)).concat("\"");
final String part1 = StringUtils.substring(finalOutput, 0,
pageIndicatorIndex.get(i) + openIndicator.length()).concat(insert);
finalOutput = part1.concat(StringUtils.substring(finalOutput, pageIndicatorIndex.get(i)
+ openIndicator.length()));
}
// ------------------------------------------------ //
// prettyPrint(finalOutput);
return finalOutput;
}
// ------------------------------------------------ //
// --
// ------------------------------------------------ //
/**
* @return the luceneAnalyzer
*/
public static Analyzer getLuceneAnalyzer() {
return luceneAnalyzer;
}
/**
* @return the queryParser
*/
public static QueryParser getQueryParser() {
return queryParser;
}
/**
* Last positions marks the metadata. e.g.: {0=1, 40=2, 3963=3, 7154=1} index
* of metadata: '7154'
*
* @param xml
* @return
*/
public static SortedMap<Integer, PartialEntry> decodePagePositions(final String xml) {
final SortedMap<Integer, PartialEntry> decodedPositions = decodePagePositions(xml,
PartialEntryType.TEXTDATA);
decodedPositions.putAll(decodePagePositions(xml, PartialEntryType.METADATA));
decodedPositions.putAll(decodePagePositions(xml, PartialEntryType.IMAGEDATA));
// System.out.println(decodedPositions);
return decodedPositions;
}
/**
*
*
* @param xml
* @param pageTag
* @return
*/
public static SortedMap<Integer, PartialEntry> decodePagePositions(final String xml,
final PartialEntryType type) {
String pageTag = null;
final SortedMap<Integer, PartialEntry> pagePositions = new TreeMap<Integer, PartialEntry>();
if (type == PartialEntryType.TEXTDATA) {
pageTag = PAGE_INDICATOR_OPEN_WITH_ID;
pagePositions.put(0, new PartialEntry(1, type));
} else if (type == PartialEntryType.METADATA) {
pageTag = PAGE_INDICATOR_META_OPEN_WITH_ID;
} else if (type == PartialEntryType.IMAGEDATA) {
pageTag = PAGE_INDICATOR_IMAGE_OPEN_WITH_ID;
}
final Pattern pattern = Pattern.compile(pageTag + "(.+?)\"");
final Matcher matcher = pattern.matcher(xml);
// aa
while (matcher.find()) {
try {
String pageID = matcher.group(1);
if (pageID != null) {
String[] parts = pageID.split(":");
if (parts.length == 2) {
Integer pagePosition = Integer.parseInt(parts[0]);
Integer pageNumber = Integer.parseInt(parts[1]);
pagePositions.put(pagePosition, new PartialEntry(pageNumber, type));
}
}
} catch (NumberFormatException e) {
} catch (Exception e) {
e.printStackTrace();
}
}
return pagePositions;
}
// ------------------------------------------------ //
// --
// ------------------------------------------------ //
/**
*
*
* @param positionMap
* @param index
* @return
*/
public static Integer getPagePosition(final SortedMap<Integer, PartialEntry> positionMap,
final Integer index) {
int bestPosition = 0;
for (Integer position : positionMap.keySet()) {
if (index >= position) {
bestPosition = Math.max(position, bestPosition);
}
}
return bestPosition;
}
/**
*
*
* @param positionMap
* @param index
* @return
*/
public static Integer getPage(final SortedMap<Integer, PartialEntry> positionMap,
final Integer index) {
final Integer bestPagePosition = getPagePosition(positionMap, index);
PartialEntry partialEntry = positionMap.get(bestPagePosition);
Integer page = null;
if (partialEntry != null) {
page = positionMap.get(bestPagePosition).getPage();
}
if (page == null) {
page = 1;
}
return page;
}
// ------------------------------------------------ //
// --
// ------------------------------------------------ //
/**
*
*
* @param xhtml
* @param fileName
* @return
*/
public static File saveXhtmlToFile(String xhtml, final String contentSearch, final File file) {
final String normalizedFileName = St.normalizeFileName(file.getName());
File tempFile = FileTools.getTempFile(normalizedFileName, HTML_EXTENSION, true);
try {
final String filePath = St.shortenHomePathInDirectory(FileTools.multiplatformPath(file));
final String fileSizeInKB = String.valueOf(file.length() / 1024);
// TODO: Performance problems if enabled
final Query query = queryParser.parse(contentSearch);
final String snippetMarker = "<span class=\"" + SNIPPET_CLASS + "\" id=\""
+ SNIPPET_ID_PREFIX + "";
xhtml = COF.getHighlightedField(query, luceneAnalyzer, "", xhtml, snippetMarker + "\">",
"</span>");
xhtml = St.numberSubstrings(xhtml, snippetMarker, false);
// StringUtils.replace(xhtml, OCR_IMAGE_BREAKLINE, HTML_BREAKLINE);
xhtml = xhtml.replace(OCR_IMAGE_BREAKLINE, HTML_BREAKLINE);
final List<Integer> indexList = St.getSubstringIndexAsList(xhtml, SNIPPET_CLASS);
final StringBuilder anchorList = new StringBuilder();
int lastIndex = 0, pseudoAnchorIndex = 1;
for (int i = 0; i < indexList.size(); i++) {
final Integer currentIndex = indexList.get(i);
if (lastIndex == 0 || currentIndex - lastIndex > MIN_SEARCH_HIT_DISTANCE) {
final Integer anchorIndex = i + 1;
anchorList.append("<a href=\"##" + SNIPPET_ID_PREFIX + "" + anchorIndex + "|"
+ XHTML_SCROLL_TIME_IN_MS + "\">" + pseudoAnchorIndex++ + "</a> ");
}
lastIndex = currentIndex;
}
Localization lc = Localization.instance();
// some basic file info like name and size for our header:
// *INDENT-OFF*
final String fileInfo =
"<p class=\"" + FILEINFO_TITLE_CLASS + "\">" +
lc.getText("SEARCH_RESULT.VIEWED_FILE") +
"</p>" +
"<div class=\"" + FILEINFO_CLASS + "\">" +
"<table>" +
"<tr>" +
"<td class=\"" + FILEINFO_KEY_CLASS + "\">" +
lc.getText("SEARCH_RESULT.PATH") +
"</td>" +
"<td>" +
filePath +
"</td>" +
"</tr>" +
"<tr>" +
"<td class=\"" + FILEINFO_KEY_CLASS + "\">" +
lc.getText("SEARCH_RESULT.SIZE") +
"</td>" +
"<td>" +
lc.getText("SEARCH_RESULT.KB", fileSizeInKB) +
"</td>" +
"</tr>" +
"<tr>" +
"<td class=\"" + FILEINFO_KEY_CLASS + "\">" +
lc.getText("SEARCH_RESULT.QUERY") +
"</td>" +
"<td>" +
contentSearch +
"</td>" +
"</tr>" +
"<tr>" +
"<td class=\"" + FILEINFO_KEY_CLASS + "\">" +
// TODO: text
"Snippets" +
"</td>" +
"<td>" +
anchorList.toString() +
"</td>" +
"</tr>" +
"</table>" +
"</div>";
// *INDENT-ON*
final String filesFolder = Config.inst().getTempFullTextStylesheetFolder().getName();
final String styleSheetPath = MessageFormat.format(STYLESHEET, filesFolder, lc
.getText("SEARCH_RESULT.SCROLLUP"));
xhtml = xhtml.replace(XMLNS, XMLNS + styleSheetPath + fileInfo);
final Pattern pattern = Pattern.compile(PAGE_INDICATOR_CUSTOM + "(.+?)\"");
final Matcher matcher = pattern.matcher(xhtml);
boolean imagesAreMarked = false;
while (matcher.find()) {
final String page[] = matcher.group(1).split(":");
if (page.length > 0) {
Integer parsedPage = null;
try {
parsedPage = Integer.parseInt(page[0]);
} catch (NumberFormatException e) {
}
if (parsedPage != null) {
final String pageString = lc.getText("SEARCH_RESULT.IMAGES_ON_PAGE", parsedPage);
// System.out.println(parsedPage + " -- " + pageString);
if (!xhtml.contains(pageString)) {
final String indicator = "<div class=\"" + IMAGE_CONTAINER_CLASS + "\">"
+ matcher.group(0);
xhtml = xhtml.replaceFirst(indicator, "<p class=\"" + PAGE_IND_CLASS + "\">"
+ pageString + "</p>" + indicator);
imagesAreMarked = true;
}
}
}
}
xhtml = xhtml.replace(METADATA_START, "<p class=\"" + METADATA_TITLE_CLASS + "\">"
+ lc.getText("SEARCH_RESULT.METADATA") + "</p>" + METADATA_START);
if (!imagesAreMarked) {
final String imageContainer = "<div class=\"" + IMAGE_CONTAINER_CLASS + "\">";
String imageContainerTitle = "<p class=\"" + IMAGE_TITLE_CLASS + "\">"
+ lc.getText("SEARCH_RESULT.IMAGE_FILES") + "</p>" + imageContainer;
xhtml = xhtml.replace(imageContainer, imageContainerTitle);
}
FileTools.stringToFile(xhtml, tempFile);
} catch (IOException e) {
// TODO log
tempFile = null;
e.printStackTrace();
} catch (ParseException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
} catch (InvalidTokenOffsetsException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
return tempFile;
}
/**
*
*
* @param text
* @param startString
* @param query
* @param analyzer
* @param maxSnippetLength
* @param maxWidthForOneLine
* @return
*/
public static Tp<String[], Integer[]> prepareHighlight(String text, String startString,
Query query, int maxSnippetLength, int maxHighlights, boolean fulltext) {
Integer firstIndex = null;
List<Integer> keys = null;
List<Integer> firstIndices = new ArrayList<Integer>();
List<String> snippets = new ArrayList<String>();
try {
String modifiedString = COF.getHighlightedField(query, luceneAnalyzer, "", text);
if (modifiedString == null) {
return null;
}
firstIndex = modifiedString.indexOf(Config.SEARCH_DELIMITER_START) - 1;
if (firstIndex < 0) {
firstIndex = 0;
// firstIndex = null;
}
HashMap<Integer, String> tags = COF.getTagValues(modifiedString);
// sort tags by their position
keys = new ArrayList<Integer>(tags.keySet());
class ScoreSorter implements Comparator<Integer> {
@Override
public int compare(Integer a, Integer b) {
return a.compareTo(b);
}
}
Collections.sort(keys, new ScoreSorter());
int lastPrintedIndex = 0;
int i = 0;
List<Integer> tempIndices = new ArrayList<Integer>();
int indices = 0;
for (Integer ind : keys) {
String firstHighlightString = tags.get(ind);
if (ind + firstHighlightString.length() <= lastPrintedIndex)
continue;
if (i >= maxHighlights && maxHighlights <= 5) {
break;
}
String snippetIndicator = "[...]";
String[] temp = St.findSn(modifiedString, firstHighlightString, snippetIndicator, ind,
maxSnippetLength);
String snippet = (fulltext ? ExtendedAscii.getAsciiAsString(174) + " " : "")
+ St.normalizeDocumentText(temp[0] + firstHighlightString + temp[1]);
tempIndices.add(ind);
// TODO: stephen king
if (i >= maxHighlights) {
continue;
}
// ------------------------------------------------ //
snippets.add(snippet);
if (!firstIndices.contains(tempIndices.get(0))) {
firstIndices.add(tempIndices.get(0) - indices);
indices += Config.SEARCH_DELIMITER_START.length()
* St.getSubstringOccurrence(snippet, Config.SEARCH_DELIMITER_START)
+ Config.SEARCH_DELIMITER_END.length()
* St.getSubstringOccurrence(snippet, Config.SEARCH_DELIMITER_END);
}
tempIndices.clear();
lastPrintedIndex = ind + firstHighlightString.length() + temp[1].length();
i++;
}
} catch (Exception e) {
// TODO: logging
e.printStackTrace();
}
// *INDENT-OFF*
return new Tp<String[], Integer[]>(
snippets.toArray (new String[snippets.size()]),
firstIndices.toArray (new Integer[firstIndices.size()])
);
// *INDENT-ON*
}
/**
*
*
* @param fulltext
* @param snippet
* snippet-string with highlight markers
* @param positions
* @param highlightedIndexFulltext
* @param fileName
* @return
*/
// TODO: "there was"-search, no snippets found
public static StyledSnippet highlightString(final String fulltext, String snippet,
final SortedMap<Integer, PartialEntry> positions, Integer highlightedIndexFulltext,
final String fileName) {
final StyledSnippet styledSnippet = new StyledSnippet();
if (snippet != null) {
// ------------------------------------------------ //
// don't color the snippet symbol
if (snippet.startsWith(SNIPPET_SYMBOL)) {
snippet = snippet.replaceFirst(Pattern.quote(SNIPPET_SYMBOL), "");
styledSnippet.add(SNIPPET_SYMBOL, StyledSnippetType.START_INDICATOR);
}
// don't color the trimmed indicator at the start
if (snippet.startsWith(St.TRIMMED_INDICATOR)) {
snippet = snippet.replaceFirst(Pattern.quote(St.TRIMMED_INDICATOR), "");
styledSnippet.add(St.TRIMMED_INDICATOR, StyledSnippetType.TRIMMED_INDICATOR);
}
boolean snippetEndsWithTrimmedIndicator = false;
// don't color the trimmed indicator at the end
if (snippet.endsWith(St.TRIMMED_INDICATOR)) {
snippet = St.replaceLast(snippet, St.TRIMMED_INDICATOR);
snippetEndsWithTrimmedIndicator = true;
}
// ------------------------------------------------ //
final HashMap<Integer, String> tags = COF.getTagValues(snippet);
// sort tags by their position
List<Integer> keys = new ArrayList<Integer>(tags.keySet());
class ScoreSorter implements Comparator<Integer> {
@Override
public int compare(Integer a, Integer b) {
return a.compareTo(b);
}
}
Collections.sort(keys, new ScoreSorter());
// ------------------------------------------------ //
Integer metadataIndexFulltext = 0;
for (Integer positionIndex : positions.keySet()) {
PartialEntry entry = positions.get(positionIndex);
if (entry.getType() == PartialEntryType.METADATA) {
metadataIndexFulltext = positionIndex;
}
}
// ------------------------------------------------ //
// *INDENT-OFF*
int loopIndex = 0,
indexAfterLastHighlightMarked = 0,
indexAfterLastHighlightUnmarked = 0,
metadataIndex = 0,
adjustedMetadataIndex = 0;
// *INDENT-ON*
// ------------------------------------------------ //
int snippetStartIndexFulltext = 0;
if (!keys.isEmpty()) {
snippetStartIndexFulltext = Math.abs(highlightedIndexFulltext - keys.get(0));
if (metadataIndexFulltext != null) {
metadataIndex = metadataIndexFulltext - snippetStartIndexFulltext;
metadataIndex = metadataIndex < 0 ? 0 : metadataIndex;
}
}
String cleanedFulltext = St.removeSearchDelimiter(fulltext);
String cleanedSnippet = St.removeSearchDelimiter(snippet);
// System.out.println(cleanedFulltext.substring(metadataIndex));
// ------------------------------------------------ //
// highlightedIndex := index of first highlight in current snippet (with
// markers)
for (Integer highlightedIndex : keys) {
final String highlightedString = tags.get(highlightedIndex);
try {
// ------------------------------------------------ //
String snippetPrefix = St.removeSearchDelimiter(snippet.substring(
indexAfterLastHighlightMarked, highlightedIndex));
adjustedMetadataIndex = metadataIndex - indexAfterLastHighlightUnmarked;
indexAfterLastHighlightUnmarked += snippetPrefix.length()
+ St.removeSearchDelimiter(highlightedString).length();
// ------------------------------------------------ //
if (highlightedIndex >= 0) {
if (snippetPrefix != null && !snippetPrefix.isEmpty()) {
// ------------------------------------------------ //
if (styledSnippet.getLength() >= metadataIndex + 4) {
addLinebreak(styledSnippet);
styledSnippet.add(snippetPrefix, StyledSnippetType.METADATA);
snippetPrefix = null;
} else if (snippetPrefix.length() > adjustedMetadataIndex) {
final String unhighlightedSnippet = snippetPrefix.substring(0,
adjustedMetadataIndex);
styledSnippet.add(unhighlightedSnippet, StyledSnippetType.FULLTEXT);
addLinebreak(styledSnippet);
final String metadataSnippet = TextProcessing.postProcess(snippetPrefix
.substring(adjustedMetadataIndex));
styledSnippet.add(metadataSnippet, StyledSnippetType.METADATA);
snippetPrefix = null;
}
// ------------------------------------------------ //
}
}
// ------------------------------------------------ //
if (snippetPrefix != null && !snippetPrefix.isEmpty()) {
styledSnippet.add(snippetPrefix, StyledSnippetType.FULLTEXT);
}
indexAfterLastHighlightMarked = highlightedIndex + highlightedString.length();
// ------------------------------------------------ //
// snippet to highlight:
String fragmentString = St.removeSearchDelimiter(highlightedString);
styledSnippet.add(fragmentString, StyledSnippetType.HIGHLIGHT);
// ------------------------------------------------ //
// last string
if (++loopIndex == keys.size()) {
String suffix = St.removeSearchDelimiter(snippet.substring(
indexAfterLastHighlightMarked, snippet.length()));
snippetStartIndexFulltext = Math.abs(highlightedIndexFulltext - highlightedIndex);
adjustedMetadataIndex = metadataIndex - indexAfterLastHighlightUnmarked;
// ------------------------------------------------ //
if (highlightedIndex >= 0 && metadataIndex >= 0 && suffix != null
&& !suffix.isEmpty()) {
if (styledSnippet.getLength() >= metadataIndex + 4) {
addLinebreak(styledSnippet);
styledSnippet.add(suffix, StyledSnippetType.METADATA);
suffix = null;
} else if (suffix.length() > adjustedMetadataIndex) {
String unhighlightedSnippet = suffix.substring(0, adjustedMetadataIndex);
styledSnippet.add(unhighlightedSnippet, StyledSnippetType.FULLTEXT);
addLinebreak(styledSnippet);
String metadataSnippet = suffix.substring(adjustedMetadataIndex);
styledSnippet.add(metadataSnippet, StyledSnippetType.METADATA);
suffix = null;
}
}
if (suffix != null) {
styledSnippet.add(suffix, StyledSnippetType.FULLTEXT);
}
}
} catch (Exception e) {
// TODO: log
System.out.println(ExceptionUtils.getStackTrace(e));
}
}
if (snippetEndsWithTrimmedIndicator) {
styledSnippet.add(St.TRIMMED_INDICATOR, StyledSnippetType.TRIMMED_INDICATOR);
}
}
return styledSnippet;
}
/**
*
*
* @param styledSnippet
*/
public static void addLinebreak(StyledSnippet styledSnippet) {
boolean metadataStarted = false, onlyConsistsOfMetadata = true;
final List<Tp<String, StyledSnippetType>> snippets = styledSnippet.getSnippets();
for (int i = snippets.size() - 1; i >= 0; i--) {
if (snippets.get(i).getValue() == StyledSnippetType.FULLTEXT) {
String fulltextSnippet = snippets.get(i).getKey();
if (!fulltextSnippet.trim().isEmpty()) {
onlyConsistsOfMetadata = false;
}
}
if (snippets.get(i).getValue() == StyledSnippetType.METADATA) {
metadataStarted = true;
}
}
boolean trimmedIndicatorRemoved = false;
if (!metadataStarted && !onlyConsistsOfMetadata) {
for (int i = snippets.size() - 1; i >= 0; i--) {
final String value = snippets.get(i).getKey().trim();
final StyledSnippetType type = snippets.get(i).getValue();
if (type == StyledSnippetType.FULLTEXT && !value.isEmpty()) {
break;
} else if (value.equals("|")) {
snippets.remove(i);
} else if (value.equals(St.TRIMMED_INDICATOR)) {
snippets.remove(i);
trimmedIndicatorRemoved = true;
}
}
styledSnippet.add("\n", StyledSnippetType.FULLTEXT);
if (trimmedIndicatorRemoved) {
styledSnippet.add(St.TRIMMED_INDICATOR, StyledSnippetType.TRIMMED_INDICATOR);
}
}
}
}