// FulltextTokenizer.java example
//
// Explorer
// structr-master
/**
 * Copyright (C) 2010-2017 Structr GmbH
 *
 * This file is part of Structr <http://structr.org>.
 *
 * Structr is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * Structr is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with Structr.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.structr.text;

import java.io.IOException;
import java.io.Writer;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.tika.language.LanguageIdentifier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.structr.api.config.Settings;

/**
 * A {@link Writer} that tokenizes the characters written to it into words.
 *
 * While characters are written, the tokenizer
 * <ul>
 * <li>accumulates a "raw text" copy in which every non-word, non-whitespace
 * character is replaced by a single space,</li>
 * <li>collects the lower-cased words whose length lies within the configured
 * minimum / maximum word length, and</li>
 * <li>stops accepting words once the configured word count limit is reached.</li>
 * </ul>
 *
 * A character belongs to a word if it is alphabetic, a digit, or contained in
 * {@link #SpecialChars}. When the tokenizer is closed, the language of the raw
 * text is detected; if detection is not reasonably certain, the language
 * remains at its initial value {@code "de"}.
 *
 * Instances keep mutable state and perform no synchronization.
 */
public class FulltextTokenizer extends Writer {

	private static final Logger logger = LoggerFactory.getLogger(FulltextTokenizer.class);

	/** Non-alphanumeric characters that are treated as part of a word. */
	public static final Set<Character> SpecialChars = new LinkedHashSet<>();

	/** Number of immediate repetitions of a character that are kept; further repetitions are dropped. */
	private static final int MAX_CHAR_REPEATS        = 10;

	/** Matches tokens that consist only of digits, minus signs, dots and commas. */
	private static final Pattern NUMBER_PATTERN      = Pattern.compile("[\\-0-9\\.,]+");

	/** Matches runs of dots and commas, used to split non-numeric tokens. */
	private static final Pattern SEPARATOR_PATTERN   = Pattern.compile("[\\.,]+");

	private final int wordCountLimit         = Settings.IndexingLimit.getValue();
	private final int wordMinLength          = Settings.IndexingMinLength.getValue();
	private final int wordMaxLength          = Settings.IndexingMaxLength.getValue();
	private final StringBuilder rawText      = new StringBuilder();
	private final StringBuilder wordBuffer   = new StringBuilder();
	private final Set<String> words          = new LinkedHashSet<>();
	private String language                  = "de";
	private String fileName                  = null;
	private char lastCharacter               = 0;
	private int consecutiveCharCount         = 0;
	private int wordCount                    = 0;

	static {

		SpecialChars.add('_');
		SpecialChars.add('ä');
		SpecialChars.add('ö');
		SpecialChars.add('ü');
		SpecialChars.add('Ä');
		SpecialChars.add('Ö');
		SpecialChars.add('Ü');
		SpecialChars.add('ß');
		SpecialChars.add('§');
		SpecialChars.add('-');
		SpecialChars.add('%');
		SpecialChars.add('/');
		SpecialChars.add('@');
		SpecialChars.add('$');
		SpecialChars.add('€');
		SpecialChars.add('æ');
		SpecialChars.add('¢');
		SpecialChars.add('.');
		SpecialChars.add(',');
		SpecialChars.add('\'');
		SpecialChars.add('\"');
		SpecialChars.add('`');
	}

	/**
	 * Creates a tokenizer.
	 *
	 * @param fileName name that is included in the log message emitted when
	 * the word count limit is reached
	 */
	public FulltextTokenizer(final String fileName) {
		this.fileName       = fileName;
	}

	/**
	 * Consumes a range of characters, extending the raw text and the word set.
	 *
	 * The call is ignored completely when the word count limit has already
	 * been reached. The end of the range is clamped to the array length.
	 *
	 * @param cbuf the characters to read from
	 * @param off index of the first character to process
	 * @param len number of characters to process
	 * @throws IOException declared by {@link Writer#write(char[], int, int)}
	 */
	@Override
	public void write(final char[] cbuf, final int off, final int len) throws IOException {

		if (wordCount < wordCountLimit) {

			final int limit  = off + len;
			final int length = Math.min(limit, cbuf.length);

			for (int i=off; i<length; i++) {

				final char c = cbuf[i];

				// A character is kept for its first occurrence plus at most
				// MAX_CHAR_REPEATS immediate repetitions; any further repetition
				// is dropped from both the raw text and the current word.
				if (c == lastCharacter) {

					if (consecutiveCharCount++ >= MAX_CHAR_REPEATS) {
						continue;
					}

				} else {

					consecutiveCharCount = 0;
				}

				if (!Character.isAlphabetic(c) && !Character.isDigit(c) && !SpecialChars.contains(c)) {

					// word boundary: emit the buffered word
					flush();

					if (Character.isWhitespace(c)) {

						rawText.append(c);

					} else {

						// non-word, non-whitespace characters become a single space
						rawText.append(" ");
					}

				} else {

					wordBuffer.append(c);
					rawText.append(c);
				}

				lastCharacter = c;
			}
		}
	}

	/**
	 * @return the detected language, or the initial value {@code "de"} if
	 * {@link #close()} has not been called or detection was not reasonably certain
	 */
	public String getLanguage() {
		return language;
	}

	/**
	 * @return the raw text accumulated so far
	 */
	public String getRawText() {
		return rawText.toString();
	}

	/**
	 * @return the set of words collected so far, in order of first occurrence;
	 * this is the tokenizer's internal set, not a copy
	 */
	public Set<String> getWords() {
		return words;
	}

	/**
	 * Emits the currently buffered word (if any) and clears the word buffer.
	 *
	 * A token that contains a dot or a comma is kept as-is when it consists
	 * only of digits, minus signs, dots and commas; otherwise it is split at
	 * runs of dots / commas and each non-blank part is added in lower case.
	 * Any other token is added in lower case.
	 *
	 * @throws IOException declared by {@link Writer#flush()}
	 */
	@Override
	public void flush() throws IOException {

		final String word = wordBuffer.toString().trim();
		if (StringUtils.isNotBlank(word)) {

			// check for numbers
			if (word.contains(".") || word.contains(",")) {

				// try to separate numbers
				if (NUMBER_PATTERN.matcher(word).matches()) {

					addWord(word);

				} else {

					for (final String rawPart : SEPARATOR_PATTERN.split(word)) {

						final String part = rawPart.trim();

						if (StringUtils.isNotBlank(part)) {

							addWord(lowerCase(part));
						}
					}
				}

			} else {

				addWord(lowerCase(word));
			}
		}

		wordBuffer.setLength(0);
	}

	/**
	 * Flushes the last buffered word and tries to detect the language of the
	 * raw text. The language is only updated when the identifier reports
	 * reasonable certainty.
	 *
	 * @throws IOException declared by {@link Writer#close()}
	 */
	@Override
	public void close() throws IOException {

		flush();

		final LanguageIdentifier identifier = new LanguageIdentifier(rawText.toString());
		if (identifier.isReasonablyCertain()) {

			language = identifier.getLanguage();
		}
	}

	/**
	 * @return the number of words accepted so far, counting repeated words
	 * each time; never exceeds the configured word count limit
	 */
	public int getWordCount() {
		return wordCount;
	}

	// ----- private methods -----
	/**
	 * Lower-cases a token independently of the JVM's default locale, so that
	 * the produced index terms do not depend on the platform configuration.
	 */
	private static String lowerCase(final String token) {
		return token.toLowerCase(Locale.ROOT);
	}

	/**
	 * Adds a word to the word set if its length is within the configured
	 * bounds and the word count limit has not been reached yet. Logs a single
	 * message at the moment the limit is reached.
	 */
	private void addWord(final String word) {

		final int length = word.length();
		if (length < wordMinLength || length > wordMaxLength) {
			return;
		}

		// enforce the limit per word, not only per write() call
		if (wordCount >= wordCountLimit) {
			return;
		}

		words.add(word);
		wordCount++;

		// wordCount can never exceed the limit, so this is logged at most once
		if (wordCount == wordCountLimit) {

			logger.info("Indexing word count of {} reached for {}, no more words will be indexed. Set {} in structr.conf to increase this limit.",
				wordCountLimit,
				fileName,
				Settings.IndexingLimit.getKey()
			);
		}
	}
}