/*
 * Copyright 2010
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.decompounding.web1t;

import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.collections4.map.LRUMap;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ParallelMultiSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.store.FSDirectory;

import com.googlecode.jweb1t.JWeb1TSearcher;

/**
 * Searches the Lucene index for n-grams.
 */
public class Finder
{
    private final JWeb1TSearcher web1tSearcher;

    private final ParallelMultiSearcher searcher;

    private final LRUMap ngramCache = new LRUMap(1000);
    private final LRUMap unigramCache = new LRUMap(1000);

    /**
     * Constructor for the finder.
     *
     * For performance reasons, it is recommended to use only one instance of this class.
     *
     * @param aIndexFolder
     *            The folder containing the Lucene index, or a folder containing multiple indexes.
     * @param nGramFolder
     *            The folder containing the JWeb1T n-grams.
     * @throws IOException
     *             if the data could not be read.
     */
    public Finder(File aIndexFolder, File nGramFolder) throws IOException
    {
        List<IndexSearcher> searcherList = new ArrayList<IndexSearcher>();

        if (checkForIndex(aIndexFolder)) {
            FSDirectory dir = FSDirectory.open(aIndexFolder);
            dir.setReadChunkSize(52428800); // 50 MB
            searcherList.add(new IndexSearcher(dir));
        }
        else {
            for (File f : aIndexFolder.listFiles()) {
                if (f.isDirectory() && checkForIndex(f)) {
                    FSDirectory dir = FSDirectory.open(f);
                    dir.setReadChunkSize(52428800); // 50 MB
                    searcherList.add(new IndexSearcher(dir));
                }
            }
        }

        searcher = new ParallelMultiSearcher(searcherList.toArray(new IndexSearcher[0]));

        web1tSearcher = new JWeb1TSearcher(nGramFolder, 1, 1);
        // web1tSearcher = new JWeb1TSearcher(new
        //         File("//home/likewise-open/UKP/santos/UKP/Library/" + "DKPro/web1t/de"), 1, 1);
        // web1tSearcher = new JWeb1TSearcher(new File("/Users/bluefire/UKP/Library/DKPro/tueba5"),
        //         1, 1);
    }

    /**
     * Checks if the folder is a Lucene index.
     */
    private boolean checkForIndex(File aIndexFolder)
    {
        File[] files = aIndexFolder.listFiles();
        if (files == null) {
            return false;
        }

        boolean result = false;
        for (File file : files) {
            if (file.isFile() && file.getName().startsWith("segments")) {
                result = true;
                break;
            }
        }
        return result;
    }

    public BigInteger freq(String aUnigram)
    {
        BigInteger f = (BigInteger) unigramCache.get(aUnigram);
        if (f != null) {
            return f;
        }

        // System.out.printf("Frequency for [%s]... ", aUnigram);
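        // Cache miss: delegate the lookup to the underlying JWeb1TSearcher and remember the
        // result, since the same unigram may be requested many times.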
", aUnigram); try { f = BigInteger.valueOf(web1tSearcher.getFrequency(aUnigram)); // System.out.printf("%d%n", f.longValue()); unigramCache.put(aUnigram, f); return f; } catch (IOException e) { throw new IllegalStateException(e); } } public BigInteger getUnigramCount() { return BigInteger.valueOf(web1tSearcher.getNrOfNgrams(1)); } /** * Find all n-grams in the index. * * @param aGram * A String of token split by space * @return all n-grams in the index. */ public List<NGramModel> find(String aGram) { return find(aGram.split(" ")); } /** * Find all n-grams containing these tokens in order but optionally with words between them. * * @param aToken * A list of tokens * @return the n-grams. */ @SuppressWarnings("unchecked") public List<NGramModel> find(String[] aToken) { BooleanQuery q = new BooleanQuery(); PhraseQuery pq = new PhraseQuery(); pq.setSlop((5 - aToken.length) >= 0 ? (5 - aToken.length) : 0); // max 5-grams in the web1t for (String t : aToken) { pq.add(new Term("gram", t.toLowerCase())); // q.add(new TermQuery(new Term("gram", t.toLowerCase())), Occur.MUST); } q.add(pq, Occur.MUST); String cacheKey = q.toString(); if (ngramCache.containsKey(cacheKey)) { List<NGramModel> list = (List<NGramModel>) ngramCache.get(cacheKey); return list; } try { // System.out.printf("Searching [%s]... ", cacheKey); NGramCollector collector = new NGramCollector(); // long start = System.currentTimeMillis(); searcher.search(q, collector); List<NGramModel> ngrams = collector.getNgrams(); ngramCache.put(cacheKey, ngrams); // long now = System.currentTimeMillis(); // System.out.printf(" (%d in %dms)%n", ngrams.size(), now - start); // for (NGram ng : ngrams) { // System.out.printf(" %s%n", ng); // } return ngrams; } catch (IOException e) { throw new IllegalStateException(e); } } public boolean contains(String aWord) { List<NGramModel> possible = find(aWord); for (NGramModel nGram : possible) { if (nGram.getGram().equals(aWord)) { return true; } } return false; } private static class NGramCollector extends Collector { private IndexReader reader; private int docBase; private final List<NGramModel> ngrams = new ArrayList<NGramModel>(); @Override public void setScorer(Scorer aScorer) throws IOException { // Not needed } @Override public void collect(int aDoc) throws IOException { Document doc = reader.document(aDoc); ngrams.add(new NGramModel(doc.get("gram"), Integer.valueOf(doc.get("freq")))); } @Override public void setNextReader(IndexReader aReader, int aDocBase) throws IOException { reader = aReader; docBase = aDocBase; } @Override public boolean acceptsDocsOutOfOrder() { // Since we access the document content, better in order to avoid seeks. return false; } public List<NGramModel> getNgrams() { return ngrams; } } }