/**
 * Copyright (C) 2010-2017 Structr GmbH
 *
 * This file is part of Structr <http://structr.org>.
 *
 * Structr is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * Structr is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with Structr. If not, see <http://www.gnu.org/licenses/>.
 */
package org.structr.text;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.structr.agent.Agent;
import org.structr.agent.ReturnValue;
import org.structr.agent.Task;
import org.structr.api.graph.Node;
import org.structr.api.index.Index;
import org.structr.common.fulltext.Indexable;
import org.structr.core.Services;
import org.structr.core.app.StructrApp;
import org.structr.core.entity.Person;
import org.structr.core.entity.Principal;
import static org.structr.core.graph.NodeInterface.owner;
import org.structr.core.graph.NodeService;
import org.structr.core.graph.Tx;

/**
 *
 *
 */
public class FulltextIndexingAgent extends Agent<Indexable> {

	private static final Logger logger = LoggerFactory.getLogger(FulltextIndexingAgent.class.getName());
	private static final Map<String, Set<String>> languageStopwordMap = new LinkedHashMap<>();
	public static final String TASK_NAME = "FulltextIndexing";

	@Override
	public ReturnValue processTask(final Task<Indexable> task) throws Throwable {

		if (TASK_NAME.equals(task.getType())) {

			for (final Indexable file : task.getNodes()) {
				doIndexing(file);
			}

			return ReturnValue.Success;
		}

		return ReturnValue.Abort;
	}

	@Override
	public Class getSupportedTaskType() {
		return FulltextIndexingTask.class;
	}

	@Override
	public boolean createEnclosingTransaction() {
		return false;
	}

	// ----- private methods -----
	private void doIndexing(final Indexable file) {

		boolean parsingSuccessful = false;
		InputStream inputStream   = null;
		String fileName           = "unknown file";

		try {

			// fetch input stream and file name in a separate transaction
			try (final Tx tx = StructrApp.getInstance().tx()) {

				inputStream = file.getInputStream();
				fileName    = file.getName();

				tx.success();
			}

			if (inputStream != null) {

				try (final FulltextTokenizer tokenizer = new FulltextTokenizer(fileName)) {

					try (final InputStream is = inputStream) {

						final Detector detector       = new DefaultDetector(MimeTypes.getDefaultMimeTypes());
						final AutoDetectParser parser = new AutoDetectParser(detector);
						final Map<MediaType, Parser> customParsers = new HashMap<>();
						customParsers.put(MediaType.application("pdf"), new PDFParser());
						parser.setParsers(customParsers);

						final Metadata metadata = new Metadata();
						parser.parse(is, new BodyContentHandler(tokenizer), metadata);
						parsingSuccessful = true;

						// log the metadata field names Tika detected
						logger.info(String.join(", ", metadata.names()));
					}

					// only do indexing when parsing was successful
					if (parsingSuccessful) {

						try (final Tx tx = StructrApp.getInstance().tx()) {

							// don't modify access time when indexing is finished
							file.getSecurityContext().preventModificationOfAccessTime();

							// save raw extracted text
							file.setProperty(Indexable.extractedContent, tokenizer.getRawText());

							// tokenize file name
							tokenizer.write(fileName);

							// tokenize owner name
							final Principal _owner = file.getProperty(owner);
							if (_owner != null) {

								final String ownerName = _owner.getName();
								if (ownerName != null) {
									tokenizer.write(ownerName);
								}

								final String eMail = _owner.getProperty(Person.eMail);
								if (eMail != null) {
									tokenizer.write(eMail);
								}

								final String twitterName = _owner.getProperty(Person.twitterName);
								if (twitterName != null) {
									tokenizer.write(twitterName);
								}
							}

							tx.success();
						}

						// index document excluding stop words
						final NodeService nodeService       = Services.getInstance().getService(NodeService.class);
						final Index<Node> fulltextIndex     = nodeService.getNodeIndex();
						final Set<String> stopWords         = languageStopwordMap.get(tokenizer.getLanguage());
						final String indexKeyName           = Indexable.indexedWords.jsonName();
						final Iterator<String> wordIterator = tokenizer.getWords().iterator();
						final Node node                     = file.getNode();
						final Set<String> indexedWords      = new TreeSet<>();

						// the inner loop drains the iterator, so the whole index update
						// runs in a single transaction; the outer check merely skips the
						// transaction for documents without any words
						while (wordIterator.hasNext()) {

							try (final Tx tx = StructrApp.getInstance().tx()) {

								// remove node from index (in case of previous indexing runs)
								fulltextIndex.remove(node, indexKeyName);

								while (wordIterator.hasNext()) {

									// strip double quotes
									final String word = StringUtils.strip(wordIterator.next(), "\"");

									// guard against languages without a stopword list
									if (stopWords == null || !stopWords.contains(word)) {

										indexedWords.add(word);
										fulltextIndex.add(node, indexKeyName, word, String.class);
									}
								}

								tx.success();
							}
						}

						// store indexed words separately
						try (final Tx tx = StructrApp.getInstance().tx()) {

							// don't modify access time when indexing is finished
							file.getSecurityContext().preventModificationOfAccessTime();

							// store indexed words
							file.setProperty(Indexable.indexedWords, indexedWords.toArray(new String[0]));

							tx.success();
						}

						logger.info("Indexing of {} finished, {} words extracted", fileName, tokenizer.getWordCount());
					}
				}
			}

		} catch (final Throwable t) {

			logger.warn("Indexing of {} failed", fileName, t);
		}
	}

	static {

		try (final ZipInputStream zis = new ZipInputStream(new BufferedInputStream(FulltextIndexingAgent.class.getResourceAsStream("/stopwords/stop-words.zip")))) {

			for (ZipEntry entry = zis.getNextEntry(); entry != null; entry = zis.getNextEntry()) {

				if (!entry.isDirectory()) {

					final String entryName = entry.getName();
					if (entryName.contains("_") && entryName.endsWith(".txt")) {

						// the two characters after the last underscore in the
						// entry name are taken as the language code
						final int langPos     = entryName.lastIndexOf("_") + 1;
						final String language = entryName.substring(langPos, langPos + 2);

						Set<String> stopwordSet = languageStopwordMap.get(language);
						if (stopwordSet == null) {

							stopwordSet = new LinkedHashSet<>();
							languageStopwordMap.put(language, stopwordSet);
						}

						// read stopword set (ZipInputStream signals EOF at the end of each entry)
						for (final String word : IOUtils.readLines(zis)) {
							stopwordSet.add(word.trim());
						}
					}
				}
			}

		} catch (IOException ioex) {
			logger.warn("", ioex);
		}
	}
}
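
// Usage sketch (not part of this class): an indexing run is typically triggered
// by handing a FulltextIndexingTask for one or more Indexable nodes to Structr's
// agent infrastructure, which dispatches it to this agent via its task type.
// The AgentService lookup and the FulltextIndexingTask constructor shown below
// are assumptions for illustration, not confirmed signatures:
//
//   final AgentService agentService = Services.getInstance().getService(AgentService.class);
//   agentService.processTask(new FulltextIndexingTask(indexableNode)); // hypothetical constructor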