/* documentr - Edit, maintain, and present software documentation on the web. Copyright (C) 2012-2013 Maik Schreiber This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ package de.blizzy.documentr.search; import java.io.File; import java.io.IOException; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicBoolean; import javax.annotation.PostConstruct; import javax.annotation.PreDestroy; import lombok.extern.slf4j.Slf4j; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.ReaderManager; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.SearcherManager; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.cyberneko.html.HTMLEntities; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.security.core.Authentication; import org.springframework.stereotype.Component; import com.google.common.base.Stopwatch; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.eventbus.Subscribe; import com.google.common.util.concurrent.ListeningExecutorService; import de.blizzy.documentr.Settings; import de.blizzy.documentr.access.DocumentrAnonymousAuthenticationFactory; import de.blizzy.documentr.access.DocumentrPermissionEvaluator; import de.blizzy.documentr.access.UserStore; import de.blizzy.documentr.markdown.MarkdownProcessor; import de.blizzy.documentr.page.IPageStore; import de.blizzy.documentr.page.Page; import de.blizzy.documentr.page.PageChangedEvent; import de.blizzy.documentr.page.PageTextData; import de.blizzy.documentr.page.PagesDeletedEvent; import de.blizzy.documentr.repository.BranchCreatedEvent; import de.blizzy.documentr.repository.IGlobalRepositoryManager; import de.blizzy.documentr.repository.ProjectBranchDeletedEvent; import de.blizzy.documentr.repository.ProjectBranchRenamedEvent; import de.blizzy.documentr.repository.ProjectDeletedEvent; import de.blizzy.documentr.repository.ProjectRenamedEvent; import de.blizzy.documentr.util.Replacement; import de.blizzy.documentr.util.Util; @Component @Slf4j public class PageIndex { static final String PROJECT = "project"; //$NON-NLS-1$ static final String BRANCH = "branch"; //$NON-NLS-1$ static final String PATH = "path"; //$NON-NLS-1$ static final String ALL_TEXT = "allText"; //$NON-NLS-1$ static final String TAG = "tag"; //$NON-NLS-1$ static final String TITLE = "title"; //$NON-NLS-1$ static final String TEXT = "text"; //$NON-NLS-1$ static final String VIEW_RESTRICTION_ROLE = "viewRestrictionRole"; //$NON-NLS-1$ static final String ALL_TEXT_SUGGESTIONS = "allTextSuggestions"; //$NON-NLS-1$ private static final String FULL_PATH = "fullPath"; //$NON-NLS-1$ private static final int REFRESH_INTERVAL = 30; // seconds @SuppressWarnings("nls") private static final List<Replacement> REMOVE_HTML_TAGS = Lists.newArrayList( Replacement.dotAllNoCase("(<br(?: .*?)?(?:/)?>)", "\n$1"), Replacement.dotAllNoCase("(<p(?: .*?)?>)", "\n$1"), Replacement.dotAllNoCase("(<pre(?: .*?)?>)", "\n$1"), Replacement.dotAllNoCase("(<div(?: .*?)?>)", "\n$1"), Replacement.dotAllNoCase("(<ol(?: .*?)?>)", "\n$1"), Replacement.dotAllNoCase("(<ul(?: .*?)?>)", "\n$1"), Replacement.dotAllNoCase("(<dl(?: .*?)?>)", "\n$1"), Replacement.dotAllNoCase("(<td(?: .*?)?>)", "\n$1"), Replacement.dotAllNoCase("(<h[0-9]+(?: .*?)?>)", "\n$1"), Replacement.dotAllNoCase("<script.*?>.*?</script>", StringUtils.EMPTY), Replacement.dotAllNoCase("<.*?>", StringUtils.EMPTY) ); @Autowired private Settings settings; @Autowired private DocumentrPermissionEvaluator permissionEvaluator; @Autowired private MarkdownProcessor markdownProcessor; @Autowired private DocumentrAnonymousAuthenticationFactory authenticationFactory; @Autowired private IPageStore pageStore; @Autowired private IGlobalRepositoryManager repoManager; @Autowired private UserStore userStore; @Autowired private ListeningExecutorService taskExecutor; @Autowired private IGlobalRepositoryManager globalRepositoryManager; private Analyzer analyzer; private Directory directory; private IndexWriter writer; private ReaderManager readerManager; private SearcherManager searcherManager; private AtomicBoolean dirty = new AtomicBoolean(); @PostConstruct public void init() throws IOException { File indexDir = new File(settings.getDocumentrDataDir(), "index"); //$NON-NLS-1$ File pageIndexDir = new File(indexDir, "page"); //$NON-NLS-1$ FileUtils.forceMkdir(pageIndexDir); directory = FSDirectory.open(pageIndexDir); Analyzer defaultAnalyzer = new EnglishAnalyzer(Version.LUCENE_43); Map<String, Analyzer> fieldAnalyzers = Maps.newHashMap(); fieldAnalyzers.put(ALL_TEXT_SUGGESTIONS, new StandardAnalyzer(Version.LUCENE_43)); analyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, fieldAnalyzers); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, analyzer); config.setOpenMode(OpenMode.CREATE_OR_APPEND); writer = new IndexWriter(directory, config); writer.commit(); readerManager = new ReaderManager(directory); searcherManager = new SearcherManager(directory, null); log.info("checking if index is empty"); //$NON-NLS-1$ if (getNumDocuments() == 0) { reindexEverything(); } } @PreDestroy public void destroy() { Util.closeQuietly(searcherManager); Util.closeQuietly(readerManager); Util.closeQuietly(writer); Util.closeQuietly(directory); } private void reindexEverything() throws IOException { log.info("reindexing everything"); //$NON-NLS-1$ for (String projectName : repoManager.listProjects()) { for (String branchName : repoManager.listProjectBranches(projectName)) { addPages(projectName, branchName); } } } @Subscribe public void addPage(PageChangedEvent event) { String projectName = event.getProjectName(); String branchName = event.getBranchName(); String path = event.getPath(); submitAddPageTask(projectName, branchName, path); } private void submitAddPageTask(final String projectName, final String branchName, final String path) { Runnable runnable = new Runnable() { @Override public void run() { try { addPageAsync(projectName, branchName, path); } catch (IOException e) { log.error(StringUtils.EMPTY, e); } catch (RuntimeException e) { log.error(StringUtils.EMPTY, e); } } }; taskExecutor.submit(runnable); } @Subscribe public void addPages(BranchCreatedEvent event) { String projectName = event.getProjectName(); String branchName = event.getBranchName(); try { addPages(projectName, branchName); } catch (IOException e) { log.error(StringUtils.EMPTY, e); } } private void addPages(String projectName, String branchName) throws IOException { List<String> paths = pageStore.listAllPagePaths(projectName, branchName); for (String path : paths) { submitAddPageTask(projectName, branchName, path); } } @Subscribe public void renameProject(ProjectRenamedEvent event) { String projectName = event.getProjectName(); String newProjectName = event.getNewProjectName(); try { submitRenameProjectTask(projectName, newProjectName); } catch (IOException e) { log.error(StringUtils.EMPTY, e); } } private void submitRenameProjectTask(final String projectName, final String newProjectName) throws IOException { final Map<String, List<String>> branchPagePaths = Maps.newHashMap(); List<String> branches = globalRepositoryManager.listProjectBranches(newProjectName); for (String branch : branches) { List<String> pagePaths = pageStore.listAllPagePaths(newProjectName, branch); branchPagePaths.put(branch, pagePaths); } Runnable runnable = new Runnable() { @Override public void run() { try { renameProjectAsync(projectName, newProjectName, branchPagePaths); } catch (IOException e) { log.error(StringUtils.EMPTY, e); } catch (RuntimeException e) { log.error(StringUtils.EMPTY, e); } } }; taskExecutor.submit(runnable); } private void renameProjectAsync(String projectName, String newProjectName, Map<String, List<String>> branchPagePaths) throws IOException { deleteProjectInternal(projectName); for (Map.Entry<String, List<String>> entry : branchPagePaths.entrySet()) { String branch = entry.getKey(); for (String pagePath : entry.getValue()) { submitAddPageTask(newProjectName, branch, pagePath); } } } private void addPageAsync(String projectName, String branchName, String path) throws IOException { String fullPath = projectName + "/" + branchName + "/" + Util.toUrlPagePath(path); //$NON-NLS-1$ //$NON-NLS-2$ log.info("indexing page {}", fullPath); //$NON-NLS-1$ Page page = pageStore.getPage(projectName, branchName, path, true); String text = ((PageTextData) page.getData()).getText(); Authentication authentication = authenticationFactory.create(UserStore.ANONYMOUS_USER_LOGIN_NAME); text = markdownProcessor.markdownToHtml(text, projectName, branchName, path, authentication, null, false, null); text = removeHtmlTags(text); text = replaceHtmlEntities(text); Document doc = new Document(); doc.add(new StringField(FULL_PATH, fullPath, Store.NO)); doc.add(new StringField(PROJECT, projectName, Store.YES)); doc.add(new StringField(BRANCH, branchName, Store.YES)); doc.add(new StringField(PATH, path, Store.YES)); for (String tag : page.getTags()) { doc.add(new StringField(TAG, tag, Store.YES)); } String viewRestrictionRole = page.getViewRestrictionRole(); if (StringUtils.isNotBlank(viewRestrictionRole)) { doc.add(new StringField(VIEW_RESTRICTION_ROLE, viewRestrictionRole, Store.NO)); } doc.add(new TextField(TITLE, page.getTitle(), Store.YES)); doc.add(new TextField(TEXT, text, Store.YES)); for (String field : new String[] { ALL_TEXT, ALL_TEXT_SUGGESTIONS }) { doc.add(new TextField(field, projectName, Store.NO)); doc.add(new TextField(field, branchName, Store.NO)); doc.add(new TextField(field, page.getTitle(), Store.NO)); doc.add(new TextField(field, text, Store.NO)); for (String tag : page.getTags()) { doc.add(new TextField(field, tag, Store.NO)); } } writer.updateDocument(new Term(FULL_PATH, fullPath), doc); dirty.set(true); } private String removeHtmlTags(String html) { for (Replacement replacement : REMOVE_HTML_TAGS) { html = replacement.replaceAll(html); } return html; } private String replaceHtmlEntities(String html) { for (;;) { int pos = html.indexOf('&'); if (pos < 0) { break; } int endPos = html.indexOf(';', pos + 1); if (endPos < 0) { break; } String entityName = html.substring(pos + 1, endPos); int c = HTMLEntities.get(entityName); html = StringUtils.replace(html, "&" + entityName + ";", //$NON-NLS-1$ //$NON-NLS-2$ (c >= 0) ? String.valueOf((char) c) : StringUtils.EMPTY); } return html; } @Subscribe public void deletePages(PagesDeletedEvent event) { deletePages(event.getProjectName(), event.getBranchName(), event.getPaths()); } private void deletePages(final String projectName, final String branchName, final Set<String> paths) { Runnable runnable = new Runnable() { @Override public void run() { try { deletePagesInternal(projectName, branchName, paths); } catch (IOException e) { log.error(StringUtils.EMPTY, e); } } }; Future<?> future = taskExecutor.submit(runnable); try { future.get(); } catch (InterruptedException e) { // ignore } catch (ExecutionException e) { log.warn(StringUtils.EMPTY, e.getCause()); } } private void deletePagesInternal(String projectName, String branchName, Set<String> paths) throws IOException { boolean dirty = false; try { for (String path : paths) { String fullPath = projectName + "/" + branchName + "/" + Util.toUrlPagePath(path); //$NON-NLS-1$ //$NON-NLS-2$ log.info("deleting page {}", fullPath); //$NON-NLS-1$ writer.deleteDocuments(new Term(FULL_PATH, fullPath)); dirty = true; } } finally { if (dirty) { this.dirty.set(true); } } } @Subscribe public void deleteProject(ProjectDeletedEvent event) { String projectName = event.getProjectName(); submitDeleteProjectTask(projectName); } private void submitDeleteProjectTask(final String projectName) { Runnable runnable = new Runnable() { @Override public void run() { try { deleteProjectInternal(projectName); } catch (IOException e) { log.error(StringUtils.EMPTY, e); } catch (RuntimeException e) { log.error(StringUtils.EMPTY, e); } } }; taskExecutor.submit(runnable); } private void deleteProjectInternal(String projectName) throws IOException { boolean dirty = false; try { log.info("deleting project {}", projectName); //$NON-NLS-1$ writer.deleteDocuments(new Term(PROJECT, projectName)); dirty = true; } finally { if (dirty) { this.dirty.set(true); } } } @Subscribe public void deleteProjectBranch(ProjectBranchDeletedEvent event) { String projectName = event.getProjectName(); String branchName = event.getBranchName(); submitDeleteProjectBranchTask(projectName, branchName); } private void submitDeleteProjectBranchTask(final String projectName, final String branchName) { Runnable runnable = new Runnable() { @Override public void run() { try { deleteProjectBranchInternal(projectName, branchName); } catch (IOException e) { log.error(StringUtils.EMPTY, e); } catch (RuntimeException e) { log.error(StringUtils.EMPTY, e); } } }; taskExecutor.submit(runnable); } private void deleteProjectBranchInternal(String projectName, String branchName) throws IOException { boolean dirty = false; try { log.info("deleting branch {}/{}", projectName, branchName); //$NON-NLS-1$ BooleanQuery bq = new BooleanQuery(); bq.add(new TermQuery(new Term(PROJECT, projectName)), BooleanClause.Occur.MUST); bq.add(new TermQuery(new Term(BRANCH, branchName)), BooleanClause.Occur.MUST); writer.deleteDocuments(bq); dirty = true; } finally { if (dirty) { this.dirty.set(true); } } } @Subscribe public void renameProjectBranch(ProjectBranchRenamedEvent event) { String projectName = event.getProjectName(); String branchName = event.getBranchName(); String newBranchName = event.getNewBranchName(); try { submitRenameProjectBranchTask(projectName, branchName, newBranchName); } catch (IOException e) { log.error(StringUtils.EMPTY, e); } } private void submitRenameProjectBranchTask(final String projectName, final String branchName, final String newBranchName) throws IOException { final List<String> paths = pageStore.listAllPagePaths(projectName, newBranchName); Runnable runnable = new Runnable() { @Override public void run() { try { renameProjectBranchAsync(projectName, branchName, newBranchName, paths); } catch (IOException e) { log.error(StringUtils.EMPTY, e); } catch (RuntimeException e) { log.error(StringUtils.EMPTY, e); } } }; taskExecutor.submit(runnable); } private void renameProjectBranchAsync(String projectName, String branchName, String newBranchName, List<String> newPagePaths) throws IOException { deleteProjectBranchInternal(projectName, branchName); for (String pagePath : newPagePaths) { submitAddPageTask(projectName, newBranchName, pagePath); } } public SearchResult findPages(String searchText, int page, Authentication authentication) throws ParseException, IOException, TimeoutException { Stopwatch stopwatch = new Stopwatch().start(); PageFinder pageFinder = new PageFinder(searcherManager, analyzer, taskExecutor, userStore, permissionEvaluator); SearchResult result = pageFinder.findPages(searchText, page, authentication); log.trace("finding pages took {} ms", stopwatch.stop().elapsed(TimeUnit.MILLISECONDS)); //$NON-NLS-1$ return result; } public Set<String> getAllTags(Authentication authentication) throws IOException, TimeoutException { Stopwatch stopwatch = new Stopwatch().start(); TagFinder tagFinder = new TagFinder(searcherManager, taskExecutor, userStore, permissionEvaluator); Set<String> tags = tagFinder.getAllTags(authentication); log.trace("getting all tags took {} ms", stopwatch.stop().elapsed(TimeUnit.MILLISECONDS)); //$NON-NLS-1$ return tags; } @Scheduled(fixedDelay=REFRESH_INTERVAL * 1000) void refresh() { try { readerManager.maybeRefresh(); } catch (IOException e) { log.warn(StringUtils.EMPTY, e); } try { searcherManager.maybeRefresh(); } catch (IOException e) { log.warn(StringUtils.EMPTY, e); } } @Scheduled(fixedDelay=REFRESH_INTERVAL * 1000) void commit() { if (dirty.getAndSet(false)) { try { writer.commit(); } catch (IOException e) { log.error(StringUtils.EMPTY, e); } } } int getNumDocuments() throws IOException { DirectoryReader reader = null; try { reader = readerManager.acquire(); return reader.numDocs(); } finally { readerManager.release(reader); } } }