/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.zeppelin.search; import java.io.IOException; import java.util.Collection; import java.util.Collections; import java.util.Date; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.LongField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.MultiFieldQueryParser; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.search.highlight.TextFragment; import org.apache.lucene.search.highlight.TokenSources; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.zeppelin.notebook.Note; import org.apache.zeppelin.notebook.Paragraph; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.Joiner; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; /** * Search (both, indexing and query) the notebooks using Lucene. * * Query is thread-safe, as creates new IndexReader every time. * Index is thread-safe, as re-uses single IndexWriter, which is thread-safe. */ public class LuceneSearch implements SearchService { private static final Logger LOG = LoggerFactory.getLogger(LuceneSearch.class); private static final String SEARCH_FIELD_TEXT = "contents"; private static final String SEARCH_FIELD_TITLE = "header"; static final String PARAGRAPH = "paragraph"; static final String ID_FIELD = "id"; Directory ramDirectory; Analyzer analyzer; IndexWriterConfig iwc; IndexWriter writer; public LuceneSearch() { ramDirectory = new RAMDirectory(); analyzer = new StandardAnalyzer(); iwc = new IndexWriterConfig(analyzer); try { writer = new IndexWriter(ramDirectory, iwc); } catch (IOException e) { LOG.error("Failed to create new IndexWriter", e); } } /* (non-Javadoc) * @see org.apache.zeppelin.search.Search#query(java.lang.String) */ @Override public List<Map<String, String>> query(String queryStr) { if (null == ramDirectory) { throw new IllegalStateException( "Something went wrong on instance creation time, index dir is null"); } List<Map<String, String>> result = Collections.emptyList(); try (IndexReader indexReader = DirectoryReader.open(ramDirectory)) { IndexSearcher indexSearcher = new IndexSearcher(indexReader); Analyzer analyzer = new StandardAnalyzer(); MultiFieldQueryParser parser = new MultiFieldQueryParser( new String[] {SEARCH_FIELD_TEXT, SEARCH_FIELD_TITLE}, analyzer); Query query = parser.parse(queryStr); LOG.debug("Searching for: " + query.toString(SEARCH_FIELD_TEXT)); SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query)); result = doSearch(indexSearcher, query, analyzer, highlighter); indexReader.close(); } catch (IOException e) { LOG.error("Failed to open index dir {}, make sure indexing finished OK", ramDirectory, e); } catch (ParseException e) { LOG.error("Failed to parse query " + queryStr, e); } return result; } private List<Map<String, String>> doSearch(IndexSearcher searcher, Query query, Analyzer analyzer, Highlighter highlighter) { List<Map<String, String>> matchingParagraphs = Lists.newArrayList(); ScoreDoc[] hits; try { hits = searcher.search(query, 20).scoreDocs; for (int i = 0; i < hits.length; i++) { LOG.debug("doc={} score={}", hits[i].doc, hits[i].score); int id = hits[i].doc; Document doc = searcher.doc(id); String path = doc.get(ID_FIELD); if (path != null) { LOG.debug((i + 1) + ". " + path); String title = doc.get("title"); if (title != null) { LOG.debug(" Title: {}", doc.get("title")); } String text = doc.get(SEARCH_FIELD_TEXT); String header = doc.get(SEARCH_FIELD_TITLE); String fragment = ""; if (text != null) { TokenStream tokenStream = TokenSources.getTokenStream(searcher.getIndexReader(), id, SEARCH_FIELD_TEXT, analyzer); TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, true, 3); LOG.debug(" {} fragments found for query '{}'", frag.length, query); for (int j = 0; j < frag.length; j++) { if ((frag[j] != null) && (frag[j].getScore() > 0)) { LOG.debug(" Fragment: {}", frag[j].toString()); } } fragment = (frag != null && frag.length > 0) ? frag[0].toString() : ""; } if (header != null) { TokenStream tokenTitle = TokenSources.getTokenStream(searcher.getIndexReader(), id, SEARCH_FIELD_TITLE, analyzer); TextFragment[] frgTitle = highlighter.getBestTextFragments(tokenTitle, header, true, 3); header = (frgTitle != null && frgTitle.length > 0) ? frgTitle[0].toString() : ""; } else { header = ""; } matchingParagraphs.add(ImmutableMap.of("id", path, // <noteId>/paragraph/<paragraphId> "name", title, "snippet", fragment, "text", text, "header", header)); } else { LOG.info("{}. No {} for this document", i + 1, ID_FIELD); } } } catch (IOException | InvalidTokenOffsetsException e) { LOG.error("Exception on searching for {}", query, e); } return matchingParagraphs; } /* (non-Javadoc) * @see org.apache.zeppelin.search.Search#updateIndexDoc(org.apache.zeppelin.notebook.Note) */ @Override public void updateIndexDoc(Note note) throws IOException { updateIndexNoteName(note); for (Paragraph p: note.getParagraphs()) { updateIndexParagraph(note, p); } } private void updateIndexNoteName(Note note) throws IOException { String noteName = note.getName(); String noteId = note.getId(); LOG.debug("Indexing Notebook {}, '{}'", noteId, noteName); if (null == noteName || noteName.isEmpty()) { LOG.debug("Skipping empty notebook name"); return; } updateDoc(noteId, noteName, null); } private void updateIndexParagraph(Note note, Paragraph p) throws IOException { if (p.getText() == null) { LOG.debug("Skipping empty paragraph"); return; } updateDoc(note.getId(), note.getName(), p); } /** * Updates index for the given note: either note.name or a paragraph If * paragraph is <code>null</code> - updates only for the note.name * * @param noteId * @param noteName * @param p * @throws IOException */ private void updateDoc(String noteId, String noteName, Paragraph p) throws IOException { String id = formatId(noteId, p); Document doc = newDocument(id, noteName, p); try { writer.updateDocument(new Term(ID_FIELD, id), doc); writer.commit(); } catch (IOException e) { LOG.error("Failed to updaet index of notebook {}", noteId, e); } } /** * If paragraph is not null, id is <noteId>/paragraphs/<paragraphId>, * otherwise it's just <noteId>. */ static String formatId(String noteId, Paragraph p) { String id = noteId; if (null != p) { id = Joiner.on('/').join(id, PARAGRAPH, p.getId()); } return id; } static String formatDeleteId(String noteId, Paragraph p) { String id = noteId; if (null != p) { id = Joiner.on('/').join(id, PARAGRAPH, p.getId()); } else { id = id + "*"; } return id; } /** * If paragraph is not null, indexes code in the paragraph, otherwise indexes * the notebook name. * * @param id id of the document, different for Note name and paragraph * @param noteName name of the note * @param p paragraph * @return */ private Document newDocument(String id, String noteName, Paragraph p) { Document doc = new Document(); Field pathField = new StringField(ID_FIELD, id, Field.Store.YES); doc.add(pathField); doc.add(new StringField("title", noteName, Field.Store.YES)); if (null != p) { doc.add(new TextField(SEARCH_FIELD_TEXT, p.getText(), Field.Store.YES)); if (p.getTitle() != null) { doc.add(new TextField(SEARCH_FIELD_TITLE, p.getTitle(), Field.Store.YES)); } Date date = p.getDateStarted() != null ? p.getDateStarted() : p.getDateCreated(); doc.add(new LongField("modified", date.getTime(), Field.Store.NO)); } else { doc.add(new TextField(SEARCH_FIELD_TEXT, noteName, Field.Store.YES)); } return doc; } /* (non-Javadoc) * @see org.apache.zeppelin.search.Search#addIndexDocs(java.util.Collection) */ @Override public void addIndexDocs(Collection<Note> collection) { int docsIndexed = 0; long start = System.nanoTime(); try { for (Note note : collection) { addIndexDocAsync(note); docsIndexed++; } } catch (IOException e) { LOG.error("Failed to index all Notebooks", e); } finally { try { // save what's been indexed, even if not full collection writer.commit(); } catch (IOException e) { LOG.error("Failed to save index", e); } long end = System.nanoTime(); LOG.info("Indexing {} notebooks took {}ms", docsIndexed, TimeUnit.NANOSECONDS.toMillis(end - start)); } } /* (non-Javadoc) * @see org.apache.zeppelin.search.Search#addIndexDoc(org.apache.zeppelin.notebook.Note) */ @Override public void addIndexDoc(Note note) { try { addIndexDocAsync(note); writer.commit(); } catch (IOException e) { LOG.error("Failed to add note {} to index", note, e); } } /** * Indexes the given notebook, but does not commit changes. * * @param note * @throws IOException */ private void addIndexDocAsync(Note note) throws IOException { indexNoteName(writer, note.getId(), note.getName()); for (Paragraph doc : note.getParagraphs()) { if (doc.getText() == null) { LOG.debug("Skipping empty paragraph"); continue; } indexDoc(writer, note.getId(), note.getName(), doc); } } /* (non-Javadoc) * @see org.apache.zeppelin.search.Search#deleteIndexDocs(org.apache.zeppelin.notebook.Note) */ @Override public void deleteIndexDocs(Note note) { deleteDoc(note, null); } /* (non-Javadoc) * @see org.apache.zeppelin.search.Search * #deleteIndexDoc(org.apache.zeppelin.notebook.Note, org.apache.zeppelin.notebook.Paragraph) */ @Override public void deleteIndexDoc(Note note, Paragraph p) { deleteDoc(note, p); } private void deleteDoc(Note note, Paragraph p) { if (null == note) { LOG.error("Trying to delete note by reference to NULL"); return; } String fullNoteOrJustParagraph = formatDeleteId(note.getId(), p); LOG.debug("Deleting note {}, out of: {}", note.getId(), writer.numDocs()); try { writer.deleteDocuments(new WildcardQuery(new Term(ID_FIELD, fullNoteOrJustParagraph))); writer.commit(); } catch (IOException e) { LOG.error("Failed to delete {} from index by '{}'", note, fullNoteOrJustParagraph, e); } LOG.debug("Done, index contains {} docs now" + writer.numDocs()); } /* (non-Javadoc) * @see org.apache.zeppelin.search.Search#close() */ @Override public void close() { try { writer.close(); } catch (IOException e) { LOG.error("Failed to .close() the notebook index", e); } } /** * Indexes a notebook name * * @throws IOException */ private void indexNoteName(IndexWriter w, String noteId, String noteName) throws IOException { LOG.debug("Indexing Notebook {}, '{}'", noteId, noteName); if (null == noteName || noteName.isEmpty()) { LOG.debug("Skipping empty notebook name"); return; } indexDoc(w, noteId, noteName, null); } /** * Indexes a single document: * - code of the paragraph (if non-null) * - or just a note name */ private void indexDoc(IndexWriter w, String noteId, String noteName, Paragraph p) throws IOException { String id = formatId(noteId, p); Document doc = newDocument(id, noteName, p); w.addDocument(doc); } }