/**
* Copyright (C) 2010-2017 Structr GmbH
*
* This file is part of Structr <http://structr.org>.
*
* Structr is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* Structr is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with Structr. If not, see <http://www.gnu.org/licenses/>.
*/
package org.structr.text;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.structr.agent.Agent;
import org.structr.agent.ReturnValue;
import org.structr.agent.Task;
import org.structr.api.graph.Node;
import org.structr.api.index.Index;
import org.structr.common.fulltext.Indexable;
import org.structr.core.Services;
import org.structr.core.app.StructrApp;
import org.structr.core.entity.Person;
import org.structr.core.entity.Principal;
import static org.structr.core.graph.NodeInterface.owner;
import org.structr.core.graph.NodeService;
import org.structr.core.graph.Tx;
/**
 * Agent that performs fulltext indexing of {@link Indexable} nodes: the
 * node's content is extracted via Apache Tika, tokenized, filtered against
 * language-specific stop word lists and written to the fulltext node index.
 */
public class FulltextIndexingAgent extends Agent<Indexable> {

	private static final Logger logger = LoggerFactory.getLogger(FulltextIndexingAgent.class.getName());

	// stop words per two-letter language code, loaded from /stopwords/stop-words.zip in the static initializer below
	private static final Map<String, Set<String>> languageStopwordMap = new LinkedHashMap<>();

	public static final String TASK_NAME = "FulltextIndexing";

	/**
	 * Indexes all nodes of the given task when its type matches {@link #TASK_NAME}.
	 *
	 * @param task the task whose {@link Indexable} nodes should be indexed
	 * @return {@code Success} when the task type matched (individual indexing
	 *         failures are logged, not propagated), {@code Abort} otherwise
	 */
	@Override
	public ReturnValue processTask(final Task<Indexable> task) throws Throwable {

		if (TASK_NAME.equals(task.getType())) {

			for (final Indexable file : task.getNodes()) {

				doIndexing(file);
			}

			return ReturnValue.Success;
		}

		return ReturnValue.Abort;
	}

	@Override
	public Class getSupportedTaskType() {
		return FulltextIndexingTask.class;
	}

	@Override
	public boolean createEnclosingTransaction() {

		// transactions are opened individually per indexing phase in doIndexing()
		return false;
	}

	// ----- private methods -----
	/**
	 * Extracts the text content of the given indexable with Apache Tika,
	 * tokenizes it together with the file and owner names, removes stop
	 * words for the detected language and writes the remaining words to
	 * the fulltext index. All failures are logged and swallowed so a
	 * broken document never aborts the surrounding task.
	 *
	 * @param file the indexable node to index
	 */
	private void doIndexing(final Indexable file) {

		boolean parsingSuccessful = false;
		InputStream inputStream   = null;
		String fileName           = "unknown file";

		try {

			// fetch stream and name in their own transaction
			try (final Tx tx = StructrApp.getInstance().tx()) {

				inputStream = file.getInputStream();
				fileName    = file.getName();

				tx.success();
			}

			if (inputStream != null) {

				try (final FulltextTokenizer tokenizer = new FulltextTokenizer(fileName)) {

					try (final InputStream is = inputStream) {

						final Detector detector                    = new DefaultDetector(MimeTypes.getDefaultMimeTypes());
						final AutoDetectParser parser              = new AutoDetectParser(detector);
						final Map<MediaType, Parser> customParsers = new HashMap<>();

						// force PDF documents through the dedicated PDF parser
						customParsers.put(MediaType.application("pdf"), new PDFParser());
						parser.setParsers(customParsers);

						final Metadata metadata = new Metadata();
						parser.parse(is, new BodyContentHandler(tokenizer), metadata);
						parsingSuccessful = true;

						logger.info(String.join(", ", metadata.names()));
					}

					// only do indexing when parsing was successful
					if (parsingSuccessful) {

						try (Tx tx = StructrApp.getInstance().tx()) {

							// don't modify access time when indexing is finished
							file.getSecurityContext().preventModificationOfAccessTime();

							// save raw extracted text
							file.setProperty(Indexable.extractedContent, tokenizer.getRawText());

							// tokenize file name (bugfix: was getName(), which returns
							// the agent's own name, not the file's)
							tokenizer.write(fileName);

							// tokenize owner name, e-mail and twitter handle
							final Principal _owner = file.getProperty(owner);
							if (_owner != null) {

								final String ownerName = _owner.getName();
								if (ownerName != null) {

									tokenizer.write(ownerName);
								}

								final String eMail = _owner.getProperty(Person.eMail);
								if (eMail != null) {

									tokenizer.write(eMail);
								}

								final String twitterName = _owner.getProperty(Person.twitterName);
								if (twitterName != null) {

									tokenizer.write(twitterName);
								}
							}

							tx.success();
						}

						// index document excluding stop words
						final NodeService nodeService       = Services.getInstance().getService(NodeService.class);
						final Index<Node> fulltextIndex     = nodeService.getNodeIndex();
						// may be null when no stop word list exists for the detected language
						final Set<String> stopWords         = languageStopwordMap.get(tokenizer.getLanguage());
						final String indexKeyName           = Indexable.indexedWords.jsonName();
						final Iterator<String> wordIterator = tokenizer.getWords().iterator();
						final Node node                     = file.getNode();
						final Set<String> indexedWords      = new TreeSet<>();

						if (wordIterator.hasNext()) {

							try (Tx tx = StructrApp.getInstance().tx()) {

								// remove node from index (in case of previous indexing runs)
								fulltextIndex.remove(node, indexKeyName);

								while (wordIterator.hasNext()) {

									// strip double quotes
									final String word = StringUtils.strip(wordIterator.next(), "\"");

									// bugfix: guard against null stop word set to avoid an
									// NPE for languages without a bundled stop word list
									if (stopWords == null || !stopWords.contains(word)) {

										indexedWords.add(word);
										fulltextIndex.add(node, indexKeyName, word, String.class);
									}
								}

								tx.success();
							}
						}

						// store indexed words separately
						try (Tx tx = StructrApp.getInstance().tx()) {

							// don't modify access time when indexing is finished
							file.getSecurityContext().preventModificationOfAccessTime();

							// store indexed words
							file.setProperty(Indexable.indexedWords, indexedWords.toArray(new String[0]));

							tx.success();
						}

						logger.info("Indexing of {} finished, {} words extracted", fileName, tokenizer.getWordCount());
					}
				}
			}

		} catch (final Throwable t) {

			logger.warn("Indexing of {} failed", fileName, t);
		}
	}

	static {

		// load per-language stop word lists from the bundled zip archive;
		// bugfix: guard against a missing resource, whose NPE would otherwise
		// escape the IOException catch and break class initialization
		final InputStream stopWordArchive = FulltextIndexingAgent.class.getResourceAsStream("/stopwords/stop-words.zip");
		if (stopWordArchive == null) {

			logger.warn("Stop word archive /stopwords/stop-words.zip not found on classpath, fulltext indexing will not filter stop words");

		} else {

			try (final ZipInputStream zis = new ZipInputStream(new BufferedInputStream(stopWordArchive))) {

				for (ZipEntry entry = zis.getNextEntry(); entry != null; entry = zis.getNextEntry()) {

					if (!entry.isDirectory()) {

						final String entryName = entry.getName();

						// entry names look like "stop-words_<lang>.txt"; the two
						// characters after the last '_' are the language code
						if (entryName.contains("_") && entryName.endsWith(".txt")) {

							final int langPos     = entryName.lastIndexOf("_") + 1;
							final String language = entryName.substring(langPos, langPos + 2);

							Set<String> stopwordSet = languageStopwordMap.get(language);
							if (stopwordSet == null) {

								stopwordSet = new LinkedHashSet<>();
								languageStopwordMap.put(language, stopwordSet);
							}

							// read stopword set (reads until the end of the current zip entry)
							for (final String word : IOUtils.readLines(zis)) {

								stopwordSet.add(word.trim());
							}
						}
					}
				}

			} catch (IOException ioex) {

				logger.warn("Unable to read stop word archive", ioex);
			}
		}
	}
}