/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.tallison.lucene.syns;

import java.io.Closeable;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

/**
 * Command-line utility that searches a syntactic-synonyms index for a single
 * target term and prints candidate synonyms (one per line, tab-separated with
 * the score) to stdout.
 * <p>
 * For each query term, the document keyed by that term is looked up in the
 * index, and its "pre" and "post" context fields are expanded into a boosted
 * boolean query whose hits are terms with similar contexts.
 */
class SearchSingleTerm implements Closeable {

    private final IndexReader indexReader;
    private final SyntacticSynsConfig config;

    /**
     * Opens the synonyms index configured in {@code config}.
     *
     * @param config configuration holding the index path and query limits
     * @throws IOException if the index cannot be opened
     */
    private SearchSingleTerm(SyntacticSynsConfig config) throws IOException {
        this.config = config;
        indexReader = DirectoryReader.open(FSDirectory.open(config.getSynsIndex()));
        System.out.println("opened:" + config.getSynsIndex());
    }

    /**
     * Entry point.  args[0] must be the path to the synonyms index directory.
     * The query term is currently hard-coded to "ufm".
     */
    public static void main(String[] args) throws IOException {
        String qTerms = "ufm";
        SyntacticSynsConfig config = new SyntacticSynsConfig(Paths.get(args[0]));
        config.setMaxTermsPerFieldInQuery(30);
        // Close the reader when done -- it holds file handles on the index.
        try (SearchSingleTerm searcher = new SearchSingleTerm(config)) {
            searcher.search(qTerms, 50);
        }
    }

    /** Releases the underlying index reader. */
    @Override
    public void close() throws IOException {
        indexReader.close();
    }

    /**
     * Searches with literal, user-supplied context strings rather than
     * contexts harvested from the index.
     *
     * @param preString  whitespace-separated terms for the "pre" context field (may be empty)
     * @param postString whitespace-separated terms for the "post" context field (may be empty)
     * @param maxHits    maximum number of results to print
     * @throws IOException on index access failure
     */
    public void searchLiteral(String preString, String postString, int maxHits) throws IOException {
        StringBuilder sb = new StringBuilder();
        if (preString.length() > 0) {
            sb.append(SyntacticSynsConfig.getSynsPreFieldName()).append(":(").append(preString).append(") ");
        }
        if (postString.length() > 0) {
            sb.append(SyntacticSynsConfig.getSynsPostFieldName()).append(":(").append(postString).append(")");
        }
        String qString = sb.toString();
        QueryParser p = null;
        Query q = null;
        try {
            // WhitespaceAnalyzer: the context fields were indexed as raw
            // whitespace-separated tokens, so no further analysis is wanted.
            p = new QueryParser("default", new org.apache.lucene.analysis.core.WhitespaceAnalyzer());
            q = p.parse(qString);
        } catch (ParseException e) {
            System.err.println("I'm sorry, but the parser was unable to parse " + qString);
            return;
        }
        if (null == q) {
            System.err.println("I'm sorry, but I'm afraid I can't find " + qString + " in the index");
            return;
        }
        search(q, maxHits);
    }

    /**
     * Builds an expansion query for {@code termString} (see {@link #createQuery})
     * and prints up to {@code maxHits} results.
     */
    private void search(String termString, int maxHits) throws IOException {
        QueryParser p = null;
        Query q = null;
        try {
            p = new QueryParser("default", new org.apache.lucene.analysis.core.WhitespaceAnalyzer());
            q = createQuery(termString, indexReader, p, config.getMaxTermsPerFieldInQuery());
        } catch (ParseException e) {
            System.err.println("I'm sorry, but the parser was unable to parse " + termString);
            return;
        }
        if (null == q) {
            System.err.println("I'm sorry, but I'm afraid I can't find " + termString + " in the index");
            return;
        }
        search(q, maxHits);
    }

    /**
     * Runs {@code q} against the index and prints each hit's target term and
     * score, tab-separated, to stdout.
     */
    private void search(Query q, int maxHits) throws IOException {
        IndexSearcher searcher = new IndexSearcher(indexReader);
        TopDocs topDocs = searcher.search(q, maxHits);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            String key = indexReader.document(scoreDoc.doc)
                    .get(SyntacticSynsConfig.getSynsTargetFieldName());
            System.out.println(key + "\t" + Float.toString(scoreDoc.score));
        }
    }

    /**
     * Builds the expansion query for one or more whitespace-separated query
     * terms: harvests each term's stored pre/post context strings, accumulates
     * per-context-term frequencies, keeps the top {@code maxTermsPerField}
     * (log-smoothed as boosts), and parses the resulting boosted query string.
     *
     * @return the parsed query; terms absent from the index are skipped with a
     *         message on stderr
     * @throws ParseException if the generated query string fails to parse
     */
    private Query createQuery(String s, IndexReader reader, QueryParser parser, int maxTermsPerField)
            throws IOException, ParseException {
        Map<String, Double> preTerms = new HashMap<>();
        Map<String, Double> postTerms = new HashMap<>();
        String preField = SyntacticSynsConfig.getSynsPreFieldName();
        String postField = SyntacticSynsConfig.getSynsPostFieldName();
        for (String term : s.trim().split(" ")) {
            Document doc = getDoc(term, reader);
            if (null == doc) {
                System.err.println("I'm afraid I haven't seen " + term + " in the index.");
                continue;
            }
            String preString = doc.get(preField);
            String postString = doc.get(postField);
            System.err.println(s + " pre: " + preString);
            System.err.println(s + " post: " + postString);
            addTerms(preString, preTerms);
            addTerms(postString, postTerms);
        }
        List<TermDoublePair> preTermList = normalizeTerms(preTerms, maxTermsPerField);
        List<TermDoublePair> postTermList = normalizeTerms(postTerms, maxTermsPerField);
        String preQString = (preTerms.size() > 0) ?
                preField + ":(" + createQString(preTermList) + ")" : "";
        String postQString = (postTerms.size() > 0) ?
                postField + ":(" + createQString(postTermList) + ")" : "";
        System.err.println(preQString + "\n" + postQString);
        return parser.parse(preQString + " " + postQString);
    }

    /**
     * Selects the top {@code maxTermsPerField} context terms by raw frequency,
     * dropping terms below the configured minimum frequency, and log-smooths
     * the surviving frequencies into query boosts (log(freq + 1)).
     */
    private List<TermDoublePair> normalizeTerms(Map<String, Double> map, int maxTermsPerField) {
        TopNList<TermDoublePair> list = new TopNList<>(maxTermsPerField);
        SimpleValuable simp = new SimpleValuable();
        for (Map.Entry<String, Double> e : map.entrySet()) {
            String key = e.getKey();
            double val = e.getValue();
            if (val < config.getMinContextFrequencyInQuery()) {
                continue;
            }
            // Cheap pre-check with a reusable holder before allocating a pair.
            simp.reset(val);
            if (list.willAdd(simp)) {
                list.add(new TermDoublePair(key, Math.log(val + 1)));
            }
        }
        return list.getList();
    }

    /**
     * Tokenizes {@code term} on whitespace and adds each token's count into
     * the running frequency map {@code terms}.  Null/empty input is a no-op.
     */
    private void addTerms(String term, Map<String, Double> terms) {
        if (term == null || term.equals("")) {
            return;
        }
        for (String it : term.trim().split(" +")) {
            terms.merge(it, 1.0, Double::sum);
        }
    }

    /** Renders "term^boost" pairs, space-separated, for the query parser. */
    private String createQString(List<TermDoublePair> list) {
        StringBuilder sb = new StringBuilder();
        for (TermDoublePair p : list) {
            sb.append(p.getKey()).append("^").append(p.getValue()).append(" ");
        }
        return sb.toString();
    }

    /**
     * Looks up the single document whose target field equals {@code s}.
     * If (unexpectedly) more than one document matches, the last one wins and
     * a warning is written to stderr -- the target field is expected to be a
     * unique key.
     *
     * @return the matching document, or {@code null} if the term is absent
     */
    private Document getDoc(String s, IndexReader reader) throws IOException {
        //TODO: normalize s?
        BytesRef bytesRef = new BytesRef(s);
        PostingsEnum docsEnum = MultiFields.getTermDocsEnum(reader,
                SyntacticSynsConfig.getSynsTargetFieldName(), bytesRef);
        if (docsEnum == null) {
            //couldn't find search term
            return null;
        }
        int matches = 0;
        int docID = -1;
        int tmpDocID = docsEnum.nextDoc();
        while (tmpDocID != PostingsEnum.NO_MORE_DOCS) {
            docID = tmpDocID;
            tmpDocID = docsEnum.nextDoc();
            matches++;
        }
        if (matches > 1) {
            // Data-integrity violation: the target field should be unique.
            System.err.println("WARNING: found " + matches + " documents for key term '" + s
                    + "'; there should only be one. Using the last.");
        }
        if (docID > -1) {
            return reader.document(docID);
        }
        return null;
    }
}