package doser.categorysuggestion.algorithm; import java.io.File; import java.io.IOException; import java.util.Collections; import java.util.LinkedList; import java.util.List; import java.util.Locale; import org.apache.log4j.Logger; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import doser.entitydisambiguation.properties.Properties; import doser.entitydisambiguation.table.logic.Type; public class StandardDbPediaCategorySuggestion { private final static StandardDbPediaCategorySuggestion INSTANCE = null; public static StandardDbPediaCategorySuggestion getInstance() { StandardDbPediaCategorySuggestion ret; if (INSTANCE == null) { ret = new StandardDbPediaCategorySuggestion(); } else { ret = INSTANCE; } return ret; } private transient IndexReader iReader; private transient IndexSearcher iSearcher; public StandardDbPediaCategorySuggestion() { super(); try { // Directory dir = FSDirectory.open(new // File("/home/quh/Arbeitsfläche/Wissensbasen/DbPediaCategories(EExcess)")); final Directory dir = FSDirectory.open(new File(Properties .getInstance().getCategorySuggestionIndex())); this.iReader = DirectoryReader.open(dir); this.iSearcher = new IndexSearcher(DirectoryReader.open(dir)); } catch (final IOException e) { Logger.getRootLogger().error(e.getStackTrace()); } } public List<Type> suggest(final String input, final String lang) { // by // quh // on // 12.02.14 // 10:37 final List<Type> list = new LinkedList<Type>(); String languageField; if (lang.equalsIgnoreCase("en")) { languageField = "label_en"; } else if (lang.equalsIgnoreCase("de")) { languageField = "label_de"; } else if (lang.equalsIgnoreCase("fr")) { languageField = "label_fr"; } else { languageField = "label_un"; } final WildcardQuery query = new WildcardQuery(new Term(languageField, input.toLowerCase(Locale.US) + "*")); try { final TopDocs docs = this.iSearcher.search(query, 3000); final ScoreDoc[] scoredoc = docs.scoreDocs; for (final ScoreDoc scoreDoc : scoredoc) { final Document doc = this.iReader.document(scoreDoc.doc); final Type cat = new Type(doc.get(languageField + "_original"), doc.get("url"), true, 0); list.add(cat); } } catch (final IOException e) { Logger.getRootLogger().error(e.getStackTrace()); } Collections.sort(list); List<Type> res; if (list.size() > 25) { res = list.subList(0, 25); } else { res = list; } return res; } // public static void main(String[] args) { // List<Category> cat = // StandardDbPediaCategorySuggestion.getInstance().suggest("Science", "en"); // for (Iterator<Category> iterator = cat.iterator(); iterator.hasNext();) { // Category category = (Category) iterator.next(); // System.out.println(category.getUrl()); // System.out.println(category.getLabel()); // } // } }