/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.tallison.lucene.syns;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
/**
 * Command-line helper that searches a "syntactic synonyms" index: given a target
 * term, it looks up the term's stored pre-/post-context fields and builds a
 * weighted context query to find distributionally similar terms.
 *
 * <p>Not thread-safe; intended for single-threaded CLI use.
 */
class SearchSingleTerm {
    private final IndexReader indexReader;
    private final SyntacticSynsConfig config;

    /**
     * Opens the synonyms index named by {@code config} for searching.
     *
     * @param config configuration pointing at the syns index directory
     * @throws IOException if the index directory cannot be opened
     */
    private SearchSingleTerm(SyntacticSynsConfig config) throws IOException {
        this.config = config;
        indexReader = DirectoryReader.open(FSDirectory.open(config.getSynsIndex()));
        System.out.println("opened:" + config.getSynsIndex());
    }

    /**
     * Entry point: {@code args[0]} is the path to the syns index directory.
     * Searches for a hard-coded demo term and prints the top hits.
     *
     * @throws IOException on index access failure
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
            System.err.println("usage: SearchSingleTerm <syns-index-dir>");
            return;
        }
        String qTerms = "ufm";
        SyntacticSynsConfig config = new SyntacticSynsConfig(Paths.get(args[0]));
        config.setMaxTermsPerFieldInQuery(30);
        SearchSingleTerm searcher = new SearchSingleTerm(config);
        try {
            searcher.search(qTerms, 50);
        } finally {
            // The reader holds file handles on the index; release them.
            searcher.close();
        }
    }

    /** Releases the underlying index reader and its file handles. */
    public void close() throws IOException {
        indexReader.close();
    }

    /**
     * Searches with literal pre-/post-context strings (whitespace-analyzed),
     * bypassing the context lookup done by {@link #createQuery}.
     *
     * @param preString  literal query for the pre-context field (may be empty)
     * @param postString literal query for the post-context field (may be empty)
     * @param maxHits    maximum number of hits to print
     * @throws IOException on index access failure
     */
    public void searchLiteral(String preString, String postString, int maxHits) throws IOException {
        if (preString.isEmpty() && postString.isEmpty()) {
            // Parsing an empty string would only produce a confusing ParseException.
            System.err.println("I'm sorry, but both the pre and post strings are empty.");
            return;
        }
        StringBuilder sb = new StringBuilder();
        if (preString.length() > 0) {
            sb.append(SyntacticSynsConfig.getSynsPreFieldName()).append(":(")
                    .append(preString).append(") ");
        }
        if (postString.length() > 0) {
            sb.append(SyntacticSynsConfig.getSynsPostFieldName()).append(":(")
                    .append(postString).append(")");
        }
        String qString = sb.toString();
        Query q;
        try {
            QueryParser p = new QueryParser("default",
                    new org.apache.lucene.analysis.core.WhitespaceAnalyzer());
            q = p.parse(qString);
        } catch (ParseException e) {
            System.err.println("I'm sorry, but the parser was unable to parse " + qString);
            return;
        }
        if (null == q) {
            System.err.println("I'm sorry, but I'm afraid I can't find " + qString + " in the index");
            return;
        }
        search(q, maxHits);
    }

    /**
     * Looks up each whitespace-separated term's stored context fields, builds a
     * weighted context query, and prints the top hits.
     *
     * @param termString whitespace-separated target term(s)
     * @param maxHits    maximum number of hits to print
     * @throws IOException on index access failure
     */
    private void search(String termString, int maxHits) throws IOException {
        Query q;
        try {
            QueryParser p = new QueryParser("default",
                    new org.apache.lucene.analysis.core.WhitespaceAnalyzer());
            q = createQuery(termString, indexReader, p, config.getMaxTermsPerFieldInQuery());
        } catch (ParseException e) {
            System.err.println("I'm sorry, but the parser was unable to parse " + termString);
            return;
        }
        if (null == q) {
            System.err.println("I'm sorry, but I'm afraid I can't find " + termString + " in the index");
            return;
        }
        search(q, maxHits);
    }

    /**
     * Executes {@code q} and prints each hit's target term and score,
     * tab-separated, one hit per line.
     *
     * @param q       the query to run
     * @param maxHits maximum number of hits to print
     * @throws IOException on index access failure
     */
    private void search(Query q, int maxHits) throws IOException {
        IndexSearcher searcher = new IndexSearcher(indexReader);
        TopDocs topDocs = searcher.search(q, maxHits);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            String target = indexReader.document(scoreDoc.doc)
                    .get(SyntacticSynsConfig.getSynsTargetFieldName());
            System.out.println(target + "\t" + scoreDoc.score);
        }
    }

    /**
     * Builds a query over the pre- and post-context fields from the stored
     * contexts of each term in {@code s}. Context term weights are the
     * log-smoothed frequencies, truncated to the top {@code maxTermsPerField}.
     *
     * @param s                whitespace-separated target term(s)
     * @param reader           index to look contexts up in
     * @param parser           parser used on the assembled query string
     * @param maxTermsPerField cap on weighted context terms per field
     * @return the parsed context query
     * @throws IOException    on index access failure
     * @throws ParseException if the assembled query string does not parse
     */
    private Query createQuery(String s, IndexReader reader, QueryParser parser, int maxTermsPerField)
            throws IOException, ParseException {
        Map<String, Double> preTerms = new HashMap<>();
        Map<String, Double> postTerms = new HashMap<>();
        String preField = SyntacticSynsConfig.getSynsPreFieldName();
        String postField = SyntacticSynsConfig.getSynsPostFieldName();
        for (String term : s.trim().split(" ")) {
            Document doc = getDoc(term, reader);
            if (null == doc) {
                System.err.println("I'm afraid I haven't seen " + term + " in the index.");
                continue;
            }
            String preString = doc.get(preField);
            String postString = doc.get(postField);
            System.err.println(s + " pre: " + preString);
            System.err.println(s + " post: " + postString);
            addTerms(preString, preTerms);
            addTerms(postString, postTerms);
        }
        List<TermDoublePair> preTermList = normalizeTerms(preTerms, maxTermsPerField);
        List<TermDoublePair> postTermList = normalizeTerms(postTerms, maxTermsPerField);
        String preQString =
                (preTerms.size() > 0) ? preField + ":(" + createQString(preTermList) + ")" : "";
        String postQString =
                (postTerms.size() > 0) ? postField + ":(" + createQString(postTermList) + ")" : "";
        System.err.println(preQString + "\n" + postQString);
        return parser.parse(preQString + " " + postQString);
    }

    /**
     * Selects the top {@code maxTermsPerField} context terms by raw frequency,
     * dropping terms below the configured minimum frequency, and returns them
     * weighted by {@code log(freq + 1)}.
     */
    private List<TermDoublePair> normalizeTerms(Map<String, Double> map, int maxTermsPerField) {
        TopNList<TermDoublePair> list = new TopNList<>(maxTermsPerField);
        SimpleValuable simp = new SimpleValuable();
        for (Map.Entry<String, Double> e : map.entrySet()) {
            double val = e.getValue();
            if (val < config.getMinContextFrequencyInQuery()) {
                continue;
            }
            // willAdd is a cheap pre-check against the current top-N threshold.
            simp.reset(val);
            if (list.willAdd(simp)) {
                list.add(new TermDoublePair(e.getKey(), Math.log(val + 1)));
            }
        }
        return list.getList();
    }

    /**
     * Tokenizes {@code contexts} on whitespace and adds each token's count
     * into the running frequency map {@code terms}.
     *
     * @param contexts space-separated context tokens; may be null or empty
     * @param terms    accumulator: token -> total frequency so far
     */
    private void addTerms(String contexts, Map<String, Double> terms) {
        if (contexts == null || contexts.isEmpty()) {
            return;
        }
        for (String token : contexts.trim().split(" +")) {
            terms.merge(token, 1.0, Double::sum);
        }
    }

    /** Renders the weighted terms as "term^weight term^weight ..." query syntax. */
    private String createQString(List<TermDoublePair> list) {
        StringBuilder sb = new StringBuilder();
        for (TermDoublePair p : list) {
            sb.append(p.getKey()).append('^').append(p.getValue()).append(' ');
        }
        return sb.toString();
    }

    /**
     * Fetches the stored document whose target field equals {@code s}.
     * The target field is expected to be unique per term; if multiple docs
     * match, the last one is used and a warning is printed.
     *
     * @param s      the target term (not analyzed — TODO: normalize?)
     * @param reader index to search
     * @return the matching document, or null if the term is not in the index
     * @throws IOException on index access failure
     */
    private Document getDoc(String s, IndexReader reader) throws IOException {
        BytesRef bytesRef = new BytesRef(s);
        PostingsEnum docsEnum = MultiFields.getTermDocsEnum(reader,
                SyntacticSynsConfig.getSynsTargetFieldName(), bytesRef);
        if (docsEnum == null) {
            // Term never indexed in the target field.
            return null;
        }
        int matches = 0;
        int docID = -1;
        for (int tmpDocID = docsEnum.nextDoc();
                tmpDocID != PostingsEnum.NO_MORE_DOCS;
                tmpDocID = docsEnum.nextDoc()) {
            docID = tmpDocID;
            matches++;
        }
        if (matches > 1) {
            System.err.println("WARNING: expected exactly one document for key term '" + s
                    + "' but found " + matches + "; using the last one.");
        }
        if (docID > -1) {
            return reader.document(docID);
        }
        return null;
    }
}