/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.tallison.lucene.syns;

import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.tallison.lucene.syns.contextifiers.Context;
import org.tallison.lucene.syns.contextifiers.ContextParser;

/**
 * Builds the syns index from the terms of the context index: the contexts observed for each
 * target key are gathered, normalized and written out as a single document per key.
 */
class SynsIndexBuilder {

    private final static Map<String, Integer> EMPTY_MAP =
            Collections.unmodifiableMap(new HashMap<String, Integer>());

    private final Analyzer analyzer = new WhitespaceAnalyzer();
    private final SyntacticSynsConfig synsConfig;

    public SynsIndexBuilder(SyntacticSynsConfig config) {
        this.synsConfig = config;
    }

    public static void main(String[] args) throws IOException {
        Path rootDir = Paths.get(args[0]);
        SyntacticSynsConfig synsConfig = new SyntacticSynsConfig(rootDir);
        SynsIndexBuilder indexer = new SynsIndexBuilder(synsConfig);
        synsConfig.setMinKeyPhraseTermFrequency(2);
        synsConfig.setMinContextTokenCount(2);
        synsConfig.setMaxTargetTypeCount(1000);
        indexer.execute();
    }

    public void execute() throws IOException {
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        config.setOpenMode(OpenMode.CREATE);
        config.setRAMBufferSizeMB(150.0);

        IndexWriter writer = new IndexWriter(FSDirectory.open(synsConfig.getSynsIndex()), config);
        IndexReader reader = DirectoryReader.open(FSDirectory.open(synsConfig.getContextIndex()));

        //TODO: context field right?
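        //The loop below walks the (sorted) terms enum for the context field. ContextParser
        //decodes each serialized term into a key, a field, a context string and a count.
        //Assuming the key is encoded as a prefix of the serialized term, all contexts for a
        //given key arrive consecutively, so a document can be flushed whenever the key changes.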
        Terms terms = MultiFields.getTerms(reader, SyntacticSynsConfig.getContextField());
        TermsEnum termEnum = terms.iterator();

        Context context = new Context();
        ContextParser parser = new ContextParser();
        Map<String, Map<String, Integer>> fields = new HashMap<String, Map<String, Integer>>();
        String lastKey = "";
        int types = 0;
        boolean skip = false;
        int sum = 0;

        BytesRef ref = termEnum.next();
        while (ref != null) {
            String txt = ref.utf8ToString();
            context = parser.parse(txt, context);
            if (context.getCount() < synsConfig.getMinContextTypeCount()) {
                ref = termEnum.next();
                continue;
            }
            if (context.isNull()) {
                ref = termEnum.next();
                continue;
            }
            //the key has changed: flush the fields buffered for the previous key
            if (!lastKey.equals(context.getKey())) {
                if (!skip) {
                    dumpDoc(writer, lastKey, fields);
                }
                fields.clear();
                skip = false;
                types = 0;
            }
            //too many context types for this key; skip the rest of its terms
            if (types > synsConfig.getMaxTargetTypeCount()) {
                skip = true;
                lastKey = context.getKey();
                ref = termEnum.next();
                continue;
            }
            Map<String, Integer> m = fields.get(context.getField());
            if (m == null) {
                m = new HashMap<String, Integer>();
            }
            m.put(context.getContext(), context.getCount());
            fields.put(context.getField(), m);
            lastKey = context.getKey();
            types++;
            sum++;
            if (sum % 1000 == 0) {
                System.err.println(String.format("processed %d types", sum));
            }
            ref = termEnum.next();
        }
        //flush whatever is buffered for the final key
        if (!skip) {
            dumpDoc(writer, lastKey, fields);
        }
        System.out.println("now I must optimize");
        writer.close();
        reader.close();
        System.out.println("Done!");
    }

    private void dumpDoc(IndexWriter writer, String key,
                         Map<String, Map<String, Integer>> fields) throws IOException {
        if (key.equals("") || fields.size() == 0) {
            return;
        }
        Document d = new Document();
        IndexableField keyField = new StringField(SyntacticSynsConfig.getSynsTargetFieldName(),
                key, Field.Store.YES);
        d.add(keyField);
        for (Map.Entry<String, Map<String, Integer>> entry : fields.entrySet()) {
            String field = entry.getKey();
            Map<String, Integer> m = entry.getValue();
            m = normalize(m);
            //if any field normalizes to an empty map, the whole document is dropped
            if (m.isEmpty()) {
                return;
            }
            d = dumpField(d, m, field);
        }
        writer.addDocument(d);
    }

    private Map<String, Integer> normalize(Map<String, Integer> m) {
        //this actually has two functions:
        //if m falls below some thresholds, return an empty map;
        //if the total count is >= maxTargetTypeCount, rework the counts so that
        //the sum stays below maxTargetTypeCount
        if (m.size() < synsConfig.getMinContextTypeCount()) {
            return EMPTY_MAP;
        }
        int sum = 0;
        for (Map.Entry<String, Integer> e : m.entrySet()) {
            sum += e.getValue();
        }
        if (sum < synsConfig.getMinKeyPhraseTermFrequency()) {
            return EMPTY_MAP;
        }
        int maxTargetTypeCount = synsConfig.getMaxTargetTypeCount();
        if (sum < maxTargetTypeCount || sum < 1) {
            return m;
        }
        Map<String, Integer> ret = new HashMap<String, Integer>();
        for (Map.Entry<String, Integer> e : m.entrySet()) {
            float p = (float) e.getValue() / (float) sum;
            int normed = Math.round(maxTargetTypeCount * p) - 1;
            if (normed > 0) {
                ret.put(e.getKey(), normed);
            }
        }
        return ret;
    }

    private Document dumpField(Document d, Map<String, Integer> buffer, String fieldName) {
        StringBuilder sb = new StringBuilder();
        //sort the contexts in descending order of count before serializing them
        Map<String, Integer> sorted = new TreeMap<>(new IntValueComparator(buffer));
        sorted.putAll(buffer);
        for (Map.Entry<String, Integer> e : sorted.entrySet()) {
            String k = e.getKey();
            int val = e.getValue();
            //repeat each context token once per count so that term frequency is preserved
            for (int i = 0; i < val; i++) {
                sb.append(k).append(' ');
            }
        }
        String s = sb.toString();
        Field field = new TextField(fieldName, s, Field.Store.YES);
        d.add(field);
        return d;
    }

    /**
     * Sorts keys in descending order of their mapped count; ties are broken by the keys' natural order.
     */
    private static class IntValueComparator implements Comparator<String>, Serializable {

        private static final long serialVersionUID = 7526472295622776147L;

        final Map<String, Integer> base;
        public IntValueComparator(Map<String, Integer> base) {
            this.base = base;
        }

        @Override
        public int compare(String a, String b) {
            if (base.get(a) < base.get(b)) {
                return 1;
            } else if (base.get(a).equals(base.get(b))) {
                return a.compareTo(b);
            } else {
                return -1;
            }
        }
    }
}