/** * Copyright (c) 2014, the Temporal Random Indexing AUTHORS. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of the University of Bari nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 * */
package di.uniba.it.tri.tokenizer;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

/**
 * Merges runs of tokens that form a known (possibly multi-word) keyword into a
 * single token whose words are joined by an underscore.
 *
 * <p>Keywords are read from a file, one per line, and indexed in an in-memory
 * Lucene index so that every keyword containing a given word can be retrieved
 * with a single term query.
 *
 * @author pierpaolo
 */
public class KeywordFinder {

    /** Name of the Lucene field holding the normalized keyword. */
    private static final String FIELD = "keyword";

    private final IndexSearcher searcher;

    /**
     * Builds the in-memory keyword index from a file containing one keyword
     * per line.
     *
     * <p>Each line is lower-cased, the characters {@code -}, {@code _},
     * {@code \} and {@code /} are replaced by a space, and the result is
     * trimmed. Lines that are blank after this normalization are skipped.
     *
     * <p>NOTE(review): the file is decoded with the platform default charset
     * and lower-cased with the default locale, as before; confirm this agrees
     * with how the tokens passed to {@link #process(List)} are produced.
     *
     * @param inputFile the keyword file
     * @throws IOException if the file cannot be read or the index cannot be
     * built
     */
    public KeywordFinder(File inputFile) throws IOException {
        RAMDirectory ramdir = new RAMDirectory();
        IndexWriterConfig conf = new IndexWriterConfig(Version.LATEST, new WhitespaceAnalyzer());
        // Both resources are closed even when reading or indexing fails.
        try (IndexWriter writer = new IndexWriter(ramdir, conf);
                BufferedReader reader = new BufferedReader(new FileReader(inputFile))) {
            String line;
            // readLine() returns null at end of stream; ready() is not an EOF test.
            while ((line = reader.readLine()) != null) {
                String keyword = normalize(line);
                if (!keyword.isEmpty()) {
                    Document doc = new Document();
                    doc.add(new TextField(FIELD, keyword, Field.Store.YES));
                    writer.addDocument(doc);
                }
            }
        }
        searcher = new IndexSearcher(DirectoryReader.open(ramdir));
    }

    /**
     * Normalizes a raw keyword line: lower-case it, turn separator characters
     * into spaces and strip surrounding whitespace.
     *
     * <p>Trimming is done after the separators are replaced, so a keyword that
     * starts or ends with a separator does not keep a leading or trailing
     * space (a leading space would produce an empty first word when the
     * keyword is later split, making it impossible to match).
     */
    private static String normalize(String line) {
        return line.toLowerCase()
                .replace('-', ' ')
                .replace('_', ' ')
                .replace('\\', ' ')
                .replace('/', ' ')
                .trim();
    }

    /**
     * Returns the stored form of every indexed keyword that contains the given
     * word as one of its whitespace-separated terms.
     *
     * @param key the word to look up; it is matched exactly against the
     * indexed terms
     * @return the matching keywords, empty if there is none
     * @throws IOException if the index cannot be read
     */
    public List<String> search(String key) throws IOException {
        TermQuery query = new TermQuery(new Term(FIELD, key));
        TopDocs hits = searcher.search(query, Integer.MAX_VALUE);
        List<String> keywords = new ArrayList<>(hits.scoreDocs.length);
        for (int i = 0; i < hits.scoreDocs.length; i++) {
            keywords.add(searcher.doc(hits.scoreDocs[i].doc).get(FIELD));
        }
        return keywords;
    }

    /**
     * Finds the longest candidate keyword whose words match the tokens
     * starting at {@code offset}.
     *
     * @param tokens the token sequence
     * @param candidate the candidate keywords, words separated by whitespace
     * @param offset index of the first token to compare
     * @return the index of the last token covered by the longest matching
     * candidate, or {@code -1} if no candidate matches
     */
    private int find(List<String> tokens, List<String> candidate, int offset) {
        int lastMatched = -1;
        for (String c : candidate) {
            String[] words = c.trim().split("\\s+");
            // A blank candidate has no words; treating it as a match would
            // report an end index before the offset.
            if (words.length == 0 || offset + words.length > tokens.size()) {
                continue;
            }
            boolean matches = true;
            for (int j = 0; matches && j < words.length; j++) {
                matches = tokens.get(offset + j).equals(words[j]);
            }
            if (matches) {
                lastMatched = Math.max(lastMatched, offset + words.length - 1);
            }
        }
        return lastMatched;
    }

    /**
     * Rewrites a token list, replacing every run of tokens that matches an
     * indexed keyword with a single token made of those tokens joined by
     * {@code _}. When several keywords match at the same position the longest
     * one is used. Tokens that start no keyword are copied unchanged.
     *
     * @param tokens the input tokens; the list is not modified
     * @return a new list with keyword runs merged
     * @throws IOException if the index cannot be read
     */
    public List<String> process(List<String> tokens) throws IOException {
        List<String> newTokens = new ArrayList<>(tokens.size());
        int i = 0;
        while (i < tokens.size()) {
            List<String> candidates = search(tokens.get(i));
            int last = candidates.isEmpty() ? -1 : find(tokens, candidates, i);
            if (last >= i) {
                StringBuilder merged = new StringBuilder(tokens.get(i));
                for (int k = i + 1; k <= last; k++) {
                    merged.append('_').append(tokens.get(k));
                }
                newTokens.add(merged.toString());
                i = last + 1;
            } else {
                newTokens.add(tokens.get(i));
                i++;
            }
        }
        return newTokens;
    }
}