/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.tallison.lucene.syns;
import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.tallison.lucene.syns.contextifiers.Context;
import org.tallison.lucene.syns.contextifiers.ContextParser;
/**
 * Builds the "syns" (syntactic synonyms) index from a previously built context index.
 * <p>
 * Every term in the context field of the context index is parsed by
 * {@link ContextParser} into a (key, field, context, count) tuple.  Tuples are
 * accumulated per key (the terms enumeration is sorted, so all tuples for one
 * key arrive consecutively) and flushed as one Lucene document per key.  Within
 * each document, every context string is repeated according to its normalized
 * count so that term frequency in the syns index mirrors the observed counts.
 */
class SynsIndexBuilder {

    /** Immutable empty map returned by {@link #normalize} when a field is filtered out. */
    private final static Map<String, Integer> EMPTY_MAP = Collections.emptyMap();

    /** Whitespace analyzer: field text is built here as pre-tokenized, space-joined strings. */
    private final Analyzer analyzer = new WhitespaceAnalyzer();

    /** Index paths and threshold settings shared across the syns pipeline. */
    private final SyntacticSynsConfig synsConfig;

    public SynsIndexBuilder(SyntacticSynsConfig config) {
        this.synsConfig = config;
    }

    /**
     * Command-line entry point.
     *
     * @param args args[0] = root directory handed to {@link SyntacticSynsConfig}
     * @throws IOException if reading the context index or writing the syns index fails
     */
    public static void main(String[] args) throws IOException {
        // Guard added: the original threw ArrayIndexOutOfBoundsException with no args.
        if (args.length < 1) {
            System.err.println("usage: SynsIndexBuilder <rootDir>");
            return;
        }
        Path rootDir = Paths.get(args[0]);
        SyntacticSynsConfig synsConfig = new SyntacticSynsConfig(rootDir);
        SynsIndexBuilder indexer = new SynsIndexBuilder(synsConfig);
        synsConfig.setMinKeyPhraseTermFrequency(2);
        synsConfig.setMinContextTokenCount(2);
        synsConfig.setMaxTargetTypeCount(1000);
        indexer.execute();
    }

    /**
     * Opens the context index for reading and the syns index for writing
     * (recreating it), streams all context terms into per-key documents, and
     * closes both indexes.
     * <p>
     * BUGFIX: the original never closed the reader and leaked the writer on any
     * exception thrown mid-build; both are now managed by try-with-resources.
     *
     * @throws IOException on index read/write failure
     */
    public void execute() throws IOException {
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        config.setOpenMode(OpenMode.CREATE);
        config.setRAMBufferSizeMB(150.0);
        try (FSDirectory synsDir = FSDirectory.open(synsConfig.getSynsIndex());
             IndexWriter writer = new IndexWriter(synsDir, config);
             FSDirectory contextDir = FSDirectory.open(synsConfig.getContextIndex());
             IndexReader reader = DirectoryReader.open(contextDir)) {
            processTerms(writer, reader);
            System.out.println("now I must optimize");
        }
        System.out.println("Done!");
    }

    /**
     * Iterates every term of the context field, accumulating parsed contexts per
     * key and dumping one document per key at each key boundary.
     *
     * @param writer destination syns index
     * @param reader source context index
     * @throws IOException on index read/write failure
     */
    private void processTerms(IndexWriter writer, IndexReader reader) throws IOException {
        //TODO: context field right?
        Terms terms = MultiFields.getTerms(reader, SyntacticSynsConfig.getContextField());
        TermsEnum termsEnum = terms.iterator();
        Context context = new Context();
        ContextParser parser = new ContextParser();
        // per-key accumulator: field name -> (context string -> count)
        Map<String, Map<String, Integer>> fields = new HashMap<String, Map<String, Integer>>();
        String lastKey = "";
        int types = 0;          // distinct context entries seen for the current key
        boolean skip = false;   // true once the current key exceeded maxTargetTypeCount
        int processed = 0;      // total entries kept, for progress logging
        for (BytesRef ref = termsEnum.next(); ref != null; ref = termsEnum.next()) {
            context = parser.parse(ref.utf8ToString(), context);
            // NOTE(review): this compares a context *count* against the minimum
            // context *type* count; confirm getMinContextTokenCount() wasn't
            // the intended threshold here.
            if (context.getCount() < synsConfig.getMinContextTypeCount()) {
                continue;
            }
            if (context.isNull()) {
                continue;
            }
            if (!lastKey.equals(context.getKey())) {
                // key boundary: flush whatever accumulated for the previous key
                if (!skip) {
                    dumpDoc(writer, lastKey, fields);
                }
                fields.clear();
                skip = false;
                types = 0;
            }
            // NOTE(review): '>' admits maxTargetTypeCount + 1 entries before
            // skipping; confirm whether '>=' was intended.
            if (types > synsConfig.getMaxTargetTypeCount()) {
                skip = true;
                lastKey = context.getKey();
                continue;
            }
            Map<String, Integer> fieldContexts = fields.get(context.getField());
            if (fieldContexts == null) {
                fieldContexts = new HashMap<String, Integer>();
                fields.put(context.getField(), fieldContexts);
            }
            fieldContexts.put(context.getContext(), context.getCount());
            lastKey = context.getKey();
            types++;
            // BUGFIX: the counter was never incremented in the original, so the
            // progress line printed "processed 0 types" on every single term.
            if (++processed % 1000 == 0) {
                System.err.println(String.format("processed %d types", processed));
            }
        }
        // flush the final key (the loop only flushes on key *changes*)
        if (!skip) {
            dumpDoc(writer, lastKey, fields);
        }
    }

    /**
     * Writes one document for {@code key}: a stored key field plus one text
     * field per accumulated context field.  A no-op for the initial empty key
     * or when nothing was accumulated.
     *
     * @param writer destination syns index
     * @param key    the target key phrase (document identity)
     * @param fields field name -> (context string -> count) accumulated for the key
     * @throws IOException on write failure
     */
    private void dumpDoc(IndexWriter writer, String key, Map<String, Map<String, Integer>> fields) throws IOException {
        if (key.isEmpty() || fields.isEmpty()) {
            return;
        }
        Document d = new Document();
        d.add(new StringField(SyntacticSynsConfig.getSynsTargetFieldName(), key, Field.Store.YES));
        for (Map.Entry<String, Map<String, Integer>> entry : fields.entrySet()) {
            Map<String, Integer> normalized = normalize(entry.getValue());
            // NOTE(review): if any single field normalizes to empty the whole
            // document is dropped (preserved from the original); confirm this
            // shouldn't be a per-field 'continue' instead.
            if (normalized.isEmpty()) {
                return;
            }
            d = dumpField(d, normalized, entry.getKey());
        }
        writer.addDocument(d);
    }

    /**
     * Filters and rescales a field's context counts.
     * <p>
     * Returns the empty map when the field has too few distinct context types
     * or too low a total frequency.  When the total frequency exceeds
     * maxTargetTypeCount, counts are proportionally rescaled so their sum stays
     * below that ceiling; otherwise the map is returned unchanged.
     *
     * @param m context string -> raw count
     * @return filtered/rescaled counts, possibly {@link #EMPTY_MAP}
     */
    private Map<String, Integer> normalize(Map<String, Integer> m) {
        // too few distinct context types for this field -> filter the field out
        if (m.size() < synsConfig.getMinContextTypeCount()) {
            return EMPTY_MAP;
        }
        int sum = 0;
        for (Map.Entry<String, Integer> e : m.entrySet()) {
            sum += e.getValue();
        }
        // total observed frequency too low -> filter the field out
        if (sum < synsConfig.getMinKeyPhraseTermFrequency()) {
            return EMPTY_MAP;
        }
        int maxTargetTypeCount = synsConfig.getMaxTargetTypeCount();
        // already small enough; 'sum < 1' also guards the division below
        if (sum < maxTargetTypeCount || sum < 1) {
            return m;
        }
        Map<String, Integer> ret = new HashMap<String, Integer>();
        for (Map.Entry<String, Integer> e : m.entrySet()) {
            float p = (float) e.getValue() / (float) sum;
            // '- 1' keeps the rescaled total strictly under the ceiling and
            // drops contexts whose share rounds down to nothing
            int normed = Math.round(maxTargetTypeCount * p) - 1;
            if (normed > 0) {
                ret.put(e.getKey(), normed);
            }
        }
        return ret;
    }

    /**
     * Appends one text field to {@code d}: contexts ordered by descending count
     * (ties broken by natural key order), each repeated {@code count} times so
     * that indexed term frequency reflects the count.
     *
     * @param d         document under construction (returned for chaining)
     * @param buffer    context string -> count for this field
     * @param fieldName Lucene field name to add
     * @return the same document instance
     */
    private Document dumpField(Document d, Map<String, Integer> buffer, String fieldName) {
        Map<String, Integer> sorted = new TreeMap<>(new IntValueComparator(buffer));
        sorted.putAll(buffer);
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<String, Integer> e : sorted.entrySet()) {
            String token = e.getKey();
            int repetitions = e.getValue();
            for (int i = 0; i < repetitions; i++) {
                sb.append(token).append(' ');
            }
        }
        d.add(new TextField(fieldName, sb.toString(), Field.Store.YES));
        return d;
    }

    /**
     * Orders keys by descending mapped value, breaking ties by natural key
     * order so the comparator is consistent with equals (required for correct
     * {@link TreeMap} behavior).
     */
    private static class IntValueComparator implements Comparator<String>, Serializable {
        private static final long serialVersionUID = 7526472295622776147L;

        /** Backing counts; keys compared must be present in this map. */
        final Map<String, Integer> base;

        public IntValueComparator(Map<String, Integer> base) {
            this.base = base;
        }

        @Override
        public int compare(String a, String b) {
            // descending by value: compare b's count against a's
            int byValue = Integer.compare(base.get(b), base.get(a));
            return byValue != 0 ? byValue : a.compareTo(b);
        }
    }
}