/**
* Copyright (C) 2010-2017 Structr GmbH
*
* This file is part of Structr <http://structr.org>.
*
* Structr is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* Structr is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with Structr. If not, see <http://www.gnu.org/licenses/>.
*/
package org.structr.text;
import java.io.IOException;
import java.io.Writer;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.tika.language.LanguageIdentifier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.structr.api.config.Settings;
/**
 * A {@link Writer} that splits the characters written to it into index words
 * and, at the same time, collects a copy of the text in which every
 * non-word, non-whitespace character is replaced by a single space.
 *
 * A character belongs to a word if it is alphabetic, a digit, or contained in
 * {@link #SpecialChars}. Any other character terminates the current word.
 * Words are only recorded if their length lies within the configured minimum
 * and maximum length, and at most the configured number of words is recorded
 * per instance. When {@link #close()} is called, the language of the collected
 * text is detected; if detection is not reasonably certain, the language stays
 * at its initial value {@code "de"}.
 */
public class FulltextTokenizer extends Writer {

	private static final Logger logger = LoggerFactory.getLogger(FulltextTokenizer.class);

	/** Tokens made up solely of digits, '-', '.' and ',' are treated as numbers and kept intact. */
	private static final Pattern NUMBER_PATTERN    = Pattern.compile("[\\-0-9\\.,]+");

	/** Runs of '.' and ',' separate the parts of a non-numeric token. */
	private static final Pattern SEPARATOR_PATTERN = Pattern.compile("[\\.,]+");

	/** Non-alphanumeric characters that are nevertheless treated as part of a word. */
	public static final Set<Character> SpecialChars = new LinkedHashSet<>();

	private final int wordCountLimit       = Settings.IndexingLimit.getValue();
	private final int wordMinLength        = Settings.IndexingMinLength.getValue();
	private final int wordMaxLength        = Settings.IndexingMaxLength.getValue();
	private final StringBuilder rawText    = new StringBuilder();
	private final StringBuilder wordBuffer = new StringBuilder();
	private final Set<String> words        = new LinkedHashSet<>();
	private String language                = "de";
	private String fileName                = null;
	private char lastCharacter             = 0;
	private int consecutiveCharCount       = 0;
	private int wordCount                  = 0;

	/** Set once the "limit reached" message has been logged, so it is emitted only once. */
	private boolean limitReported          = false;

	static {

		// insertion order is identical to the former one-by-one registration
		for (final char c : "_äöüÄÖÜß§-%/@$€æ¢.,'\"`".toCharArray()) {
			SpecialChars.add(c);
		}
	}

	/**
	 * Creates a tokenizer for the given file.
	 *
	 * @param fileName name of the file being indexed; only used in log output
	 */
	public FulltextTokenizer(final String fileName) {
		this.fileName = fileName;
	}

	/**
	 * Consumes the given characters, appending them to the raw text and
	 * collecting words. Once the word limit has been reached, further calls
	 * are ignored entirely (neither raw text nor words are collected).
	 *
	 * @param cbuf source buffer
	 * @param off index of the first character to read
	 * @param len number of characters to read; reading never goes past the end of {@code cbuf}
	 * @throws IOException declared by {@link Writer}
	 */
	@Override
	public void write(final char[] cbuf, final int off, final int len) throws IOException {

		if (wordCount >= wordCountLimit) {

			reportLimitReached();
			return;
		}

		final int end = Math.min(off + len, cbuf.length);

		for (int i = off; i < end; i++) {

			final char c = cbuf[i];

			// cap runs of identical characters: the first occurrence plus
			// ten repetitions are kept, everything after that is dropped
			if (c == lastCharacter) {

				if (consecutiveCharCount++ >= 10) {
					continue;
				}

			} else {

				consecutiveCharCount = 0;
			}

			if (Character.isAlphabetic(c) || Character.isDigit(c) || SpecialChars.contains(c)) {

				wordBuffer.append(c);
				rawText.append(c);

			} else {

				// word boundary: emit the buffered word, keep whitespace as is
				// and replace any other separator by a single space
				flush();
				rawText.append(Character.isWhitespace(c) ? c : ' ');
			}

			lastCharacter = c;
		}
	}

	/**
	 * @return the detected language, or the initial value {@code "de"} if
	 * {@link #close()} has not been called or detection was not reasonably certain
	 */
	public String getLanguage() {
		return language;
	}

	/**
	 * @return the text collected so far
	 */
	public String getRawText() {
		return rawText.toString();
	}

	/**
	 * @return the internal set of collected words in insertion order (not a copy)
	 */
	public Set<String> getWords() {
		return words;
	}

	/**
	 * Turns the content of the word buffer into one or more words and clears
	 * the buffer. A token containing '.' or ',' is kept intact if it looks
	 * like a number; otherwise it is split at runs of '.' and ',' and each
	 * non-blank part is added in lower case. Tokens without '.' and ',' are
	 * added in lower case.
	 *
	 * @throws IOException declared by {@link Writer}
	 */
	@Override
	public void flush() throws IOException {

		final String word = wordBuffer.toString().trim();

		wordBuffer.setLength(0);

		if (StringUtils.isBlank(word)) {
			return;
		}

		if (word.contains(".") || word.contains(",")) {

			if (NUMBER_PATTERN.matcher(word).matches()) {

				// number: keep separators so that e.g. "1.000,50" stays one token
				addWord(word);

			} else {

				for (final String rawPart : SEPARATOR_PATTERN.split(word)) {

					final String part = rawPart.trim();
					if (StringUtils.isNotBlank(part)) {

						addWord(part.toLowerCase());
					}
				}
			}

		} else {

			addWord(word.toLowerCase());
		}
	}

	/**
	 * Flushes the last buffered word and runs language detection on the
	 * collected raw text. The language is only updated if the detection
	 * result is reasonably certain.
	 *
	 * @throws IOException declared by {@link Writer}
	 */
	@Override
	public void close() throws IOException {

		flush();

		final LanguageIdentifier identifier = new LanguageIdentifier(rawText.toString());
		if (identifier.isReasonablyCertain()) {

			language = identifier.getLanguage();
		}
	}

	/**
	 * @return the number of words accepted so far; repeated words are counted
	 * each time they occur, and the value never exceeds the configured limit
	 */
	public int getWordCount() {
		return wordCount;
	}

	// ----- private methods -----

	/**
	 * Records the given word if its length is within the configured bounds
	 * and the word limit has not been reached yet.
	 */
	private void addWord(final String word) {

		final int length = word.length();
		if (length < wordMinLength || length > wordMaxLength) {
			return;
		}

		// enforce the limit here as well, a single write() call can contain
		// many more words than the remaining budget
		if (wordCount >= wordCountLimit) {

			reportLimitReached();
			return;
		}

		words.add(word);
		wordCount++;
	}

	/**
	 * Logs, at most once per instance, that the word limit has been reached.
	 */
	private void reportLimitReached() {

		if (limitReported) {
			return;
		}

		limitReported = true;

		logger.info("Indexing word count of {} reached for {}, no more words will be indexed. Set {} in structr.conf to increase this limit.",
			wordCountLimit,
			fileName,
			Settings.IndexingLimit.getKey()
		);
	}
}