/*
 * (C) Copyright 2012-2014 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Florent Guillaume
 */
package org.nuxeo.ecm.core.storage;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Source;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.nuxeo.ecm.core.api.DocumentLocation;
import org.nuxeo.runtime.api.Framework;

/**
 * Default fulltext parser, based on word and punctuation split, and lowercase normalization.
 * <p>
 * The regexp used can be configured using the system property {@value #WORD_SPLIT_PROP}. The default is
 * {@value #WORD_SPLIT_DEF}.
 *
 * @since 5.9.5
 */
public class DefaultFulltextParser implements FulltextParser {

    /** Name of the property holding the word-splitting regular expression. */
    public static final String WORD_SPLIT_PROP = "org.nuxeo.fulltext.wordsplit";

    /** Default word-splitting regular expression: runs of whitespace and/or punctuation. */
    public static final String WORD_SPLIT_DEF = "[\\s\\p{Punct}]+";

    /** Compiled splitter, read once at class initialization from {@link #WORD_SPLIT_PROP}. */
    protected static final Pattern WORD_SPLIT_PATTERN = Pattern.compile(
            Framework.getProperty(WORD_SPLIT_PROP, WORD_SPLIT_DEF));

    /** Maximum number of leading characters inspected when sniffing for HTML content. */
    protected static final int HTML_MAGIC_OFFSET = 8192;

    /** MIME type that triggers HTML markup removal. */
    protected static final String TEXT_HTML = "text/html";

    /**
     * Parses a field without any MIME type or document location information.
     *
     * @param s the text to parse, may be {@code null}
     * @param path the field path
     * @return the normalized words joined with single spaces, empty if there are none
     */
    @Override
    public String parse(String s, String path) {
        return parse(s, path, null, null);
    }

    /**
     * Parses a field without any MIME type or document location information, appending the words to a list.
     *
     * @param s the text to parse, may be {@code null}
     * @param path the field path
     * @param strings the list receiving the normalized words
     */
    @Override
    public void parse(String s, String path, List<String> strings) {
        parse(s, path, null, null, strings);
    }

    /**
     * Parses a field and returns its normalized words as a single string.
     *
     * @param s the text to parse, may be {@code null}
     * @param path the field path
     * @param mimeType the MIME type of the text, may be {@code null} or empty
     * @param documentLocation the document location, may be {@code null}
     * @return the normalized words joined with single spaces, empty if there are none
     */
    @Override
    public String parse(String s, String path, String mimeType, DocumentLocation documentLocation) {
        List<String> strings = new ArrayList<>();
        parse(s, path, mimeType, documentLocation, strings);
        return StringUtils.join(strings, ' ');
    }

    /**
     * {@inheritDoc}
     * <p>
     * The default implementation normalizes text to lowercase and removes punctuation. The documentLocation parameter
     * is currently unused but has some use cases for potential subclasses.
     * <p>
     * A {@code null} text (or a preprocessing step returning {@code null}) contributes no words.
     * <p>
     * This can be subclassed.
     */
    @Override
    public void parse(String s, String path, String mimeType, DocumentLocation documentLocation, List<String> strings) {
        s = preprocessField(s, path, mimeType);
        if (s == null) {
            // preprocessField passes null through (and subclasses may return it); splitting null would throw
            return;
        }
        for (String word : WORD_SPLIT_PATTERN.split(s)) {
            // split() may yield an empty leading token when the text starts with a separator
            if (!word.isEmpty()) {
                // NOTE(review): lowercasing uses the JVM default locale; kept as is so that the produced words stay
                // identical to what existing indexes and query-side normalization expect - confirm before changing
                strings.add(word.toLowerCase());
            }
        }
    }

    /**
     * Preprocesses one field at the given path.
     * <p>
     * When no MIME type is given, the beginning of the text is inspected for HTML markers. HTML content gets its
     * markup removed, and in all cases HTML entities are unescaped.
     * <p>
     * The path is unused for now.
     *
     * @param s the text to preprocess, may be {@code null}
     * @param path the field path
     * @param mimeType the MIME type of the text, may be {@code null} or empty
     * @return the preprocessed text, or {@code null} if {@code s} is {@code null}
     */
    protected String preprocessField(String s, String path, String mimeType) {
        if (s == null) {
            return null;
        }
        if (StringUtils.isEmpty(mimeType)) {
            // Use weak HTML detection here since nuxeo-core-mimetype 'magic.xml' has text/html detection commented
            String htmlMagicExtraction = s.substring(0, Math.min(s.length(), HTML_MAGIC_OFFSET));
            String htmlMagicExtractionLC = htmlMagicExtraction.toLowerCase();
            if (htmlMagicExtractionLC.startsWith("<!doctype html") || htmlMagicExtractionLC.contains("<html")) {
                mimeType = TEXT_HTML;
            }
        }
        if (TEXT_HTML.equals(mimeType)) {
            s = removeHtml(s);
        }
        return StringEscapeUtils.unescapeHtml(s);
    }

    /**
     * Renders HTML source as text using the Jericho renderer, with hyperlink URLs and font style decorations
     * turned off.
     *
     * @param s the HTML source
     * @return the rendered text
     */
    protected String removeHtml(String s) {
        Source source = new Source(s);
        Renderer renderer = source.getRenderer();
        renderer.setIncludeHyperlinkURLs(false);
        renderer.setDecorateFontStyles(false);
        return renderer.toString();
    }

}