package de.dfki.km.leech.util;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.logging.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.tika.metadata.Metadata;
import de.dfki.inquisition.file.FileUtils;
import de.dfki.inquisition.lucene.Buzzwords;
import de.dfki.inquisition.lucene.DocumentFrqClass;
import de.dfki.inquisition.lucene.FieldConfig;
import de.dfki.inquisition.lucene.LuceneUtilz;
import de.dfki.inquisition.lucene.PageCountEstimator;
import de.dfki.inquisition.processes.StopWatch;
import de.dfki.inquisition.text.StringUtils;
import de.dfki.km.leech.lucene.ToLuceneContentHandler;
import de.dfki.km.leech.metadata.LeechMetadata;
public class IndexPostprocessor
{
static protected List<String> terms(String strFieldName, String strPrefix, int iMaxTerms2Return, IndexReader reader) throws IOException, URISyntaxException
{
LinkedList<String> llFieldTerms = new LinkedList<String>();
Terms terms = MultiFields.getTerms(reader, strFieldName);
if(terms == null) return llFieldTerms;
TermsEnum termsEnum = terms.iterator();
if(!StringUtils.nullOrWhitespace(strPrefix)) {
termsEnum = new AutomatonTermsEnum(termsEnum, new CompiledAutomaton(PrefixQuery.toAutomaton(new Term(strFieldName, strPrefix).bytes())));
}
while (termsEnum.next() != null)
{
String strTerm = termsEnum.term().utf8ToString();
llFieldTerms.add(strTerm);
if(llFieldTerms.size() >= iMaxTerms2Return) break;
}
return llFieldTerms;
}
protected boolean m_bEstimatePageCounts = false;
protected boolean m_bSkipSimilarTerms;
protected int m_iMaxNumberOfBuzzwords;
protected String m_strNewField4Buzzwords;
protected String m_strNewField4FrqClass;
protected Metadata m_staticAttributes2values = new Metadata();
/**
* Enables the Buzzword creation by setting the related configuration parameters.
*
* @param strNewField4Buzzwords
* @param sAttNames4BuzzwordCalculation
* @param iMaxNumberOfBuzzwords
* @param bSkipSimilarTerms
*/
public void enableBuzzwordGeneration(String strNewField4Buzzwords, int iMaxNumberOfBuzzwords, boolean bSkipSimilarTerms)
{
this.m_strNewField4Buzzwords = strNewField4Buzzwords;
this.m_iMaxNumberOfBuzzwords = iMaxNumberOfBuzzwords;
this.m_bSkipSimilarTerms = bSkipSimilarTerms;
}
/**
* Enables to add a frequency class attribute to the documents. This is a measure how 'generalized' a document is in its topics.
*
* @param strNewField4FrqClass
*/
public void enableFrequencyClassCalculation(String strNewField4FrqClass)
{
m_strNewField4FrqClass = strNewField4FrqClass;
}
/**
* Enables to add a page count attribute to a document in the case no one is there. The method estimates the page cont (i.e. 400 terms => 1 page).
*/
public void enablePageCountEstimation()
{
m_bEstimatePageCounts = true;
}
/**
* Enables to add static attribute value pairs to each document. Thus, you can e.g. mark a specific crawl with a category attribute, etc.
*
* @param attributes2values the attribute value pairs that should be simply added to each document
*/
public void enableStaticAttributeValuePairs(Metadata attributes2values)
{
m_staticAttributes2values = attributes2values;
}
public void postprocessIndex(String strLuceneIndexPath, FieldConfig fieldConfig, String... straLuceneReadOnlyLookupPaths) throws Exception
{
// wir öffnen den einen Index lediglich lesend, erstellen alle n Einträge einen neuen Index, mergen die am Schluß zusammen und tauschen den
// gegebenen aus
if(StringUtils.nullOrWhitespace(m_strNewField4Buzzwords) && !m_bEstimatePageCounts)
Logger.getLogger(IndexPostprocessor.class.getName()).warning("Will do nothing - nothing is enabled.");
if(!StringUtils.nullOrWhitespace(m_strNewField4Buzzwords))
Logger.getLogger(LuceneIndexCreator.class.getName()).info("Index postprocessing: Will create buzzwords");
if(m_bEstimatePageCounts) Logger.getLogger(LuceneIndexCreator.class.getName()).info("Index postprocessing: Will calculate heuristic page counts");
if(!StringUtils.nullOrWhitespace(m_strNewField4FrqClass))
Logger.getLogger(LuceneIndexCreator.class.getName()).info("Index postprocessing: Will calculate document frequency classes");
long lStart = System.currentTimeMillis();
LinkedList<IndexReader> llsubReaders = new LinkedList<IndexReader>();
IndexReader reader4SourceIndex = DirectoryReader.open(new SimpleFSDirectory(Paths.get(strLuceneIndexPath)));
IndexSearcher searcher4SourceIndex = new IndexSearcher(reader4SourceIndex);
llsubReaders.add(reader4SourceIndex);
for (String strLuceneReadOnlyLookupPath : straLuceneReadOnlyLookupPaths)
llsubReaders.add(DirectoryReader.open(new SimpleFSDirectory(Paths.get(strLuceneReadOnlyLookupPath))));
IndexReader lookupReader;
if(llsubReaders.size() > 1)
lookupReader = new MultiReader(llsubReaders.toArray(new IndexReader[0]), true);
else
lookupReader = reader4SourceIndex;
// wir machen uns einen leeren initialen Writer zum schreiben - den Rest macht der ToLuceneContentHandler
File fLuceneIndex = new File(strLuceneIndexPath);
Path fOurTmpDir = Paths.get(fLuceneIndex.getAbsolutePath() + "_4PostProcessing");
IndexWriterConfig config = new IndexWriterConfig(fieldConfig.createAnalyzer());
config.setOpenMode(OpenMode.CREATE);
IndexWriter firstTmpWriter = new IndexWriter(new SimpleFSDirectory(fOurTmpDir), config);
ToLuceneContentHandler toLuceneContentHandler = new ToLuceneContentHandler(fieldConfig, firstTmpWriter);
Logger.getLogger(LuceneIndexCreator.class.getName()).info("Will get the doc ids...");
List<String> llAllIds = terms(LeechMetadata.id, "", Integer.MAX_VALUE, reader4SourceIndex);
Logger.getLogger(LuceneIndexCreator.class.getName()).info("...finished");
Set<String> sAttNames4BuzzwordCalculation = new HashSet<String>();
sAttNames4BuzzwordCalculation.add(LeechMetadata.body);
sAttNames4BuzzwordCalculation.add(Metadata.TITLE);
DocumentFrqClass documentFrqClass = null;
if(!StringUtils.nullOrWhitespace(m_strNewField4FrqClass)) documentFrqClass = new DocumentFrqClass(lookupReader, LeechMetadata.body);
int i = 0;
for (String strId : llAllIds)
{
TopDocs topDocs = searcher4SourceIndex.search(new TermQuery(new Term(LeechMetadata.id, strId)), 1);
int iDocNo = topDocs.scoreDocs[0].doc;
Document doc2modify = reader4SourceIndex.document(iDocNo);
// es gibt einen bug, das bei vorhandenen numerischen Attributen z.B. das indexed-Attribut verloren geht, wenn man es hier nochmal ausliest und neu einspielt
// - beim ersten einstellen gehts. Deshalb füge ich hier fields, die stored sind, nochmal neu ein.
LuceneUtilz.reInsertStoredFieldTypes(doc2modify, fieldConfig);
if(!StringUtils.nullOrWhitespace(m_strNewField4Buzzwords))
Buzzwords.addBuzzwords(iDocNo, doc2modify, m_strNewField4Buzzwords, sAttNames4BuzzwordCalculation, m_iMaxNumberOfBuzzwords, m_bSkipSimilarTerms,
lookupReader);
if(m_bEstimatePageCounts)
PageCountEstimator.addHeuristicDocPageCounts(iDocNo, doc2modify, Metadata.PAGE_COUNT.getName(), LeechMetadata.isHeuristicPageCount, LeechMetadata.body,
reader4SourceIndex);
if(!StringUtils.nullOrWhitespace(m_strNewField4FrqClass)) documentFrqClass.addDocumentFrequencyClass(iDocNo, doc2modify, m_strNewField4FrqClass);
for (String strAttName : m_staticAttributes2values.names())
{
String strAttValue = m_staticAttributes2values.get(strAttName);
Field field = fieldConfig.createField(strAttName, strAttValue);
doc2modify.add(field);
}
toLuceneContentHandler.processNewDocument(doc2modify);
if(++i % 100000 == 0) Logger.getLogger(LuceneIndexCreator.class.getName()).info(StringUtils.beautifyNumber(i) + " docs postprocessed");
}
Logger.getLogger(LuceneIndexCreator.class.getName()).info(StringUtils.beautifyNumber(i) + " docs postprocessed");
toLuceneContentHandler.crawlFinished();
firstTmpWriter.forceMerge(1, true);
firstTmpWriter.close();
if(lookupReader instanceof MultiReader)
lookupReader.close();
else
reader4SourceIndex.close();
// jetzt müssen wir den alten Index durch den neuen ersetzen
// es nervt, wenn es ein neues Verzeichnis ist (Kommandozeile) - besser die Inhalte verschieben
// wir verschieben alle Dateien vom alten Index in ein neues, temporäres
// File fBackup = new File(fLuceneIndex.getAbsolutePath() + "_bak");
// fLuceneIndex.renameTo(fBackup);
Path pUnpostProcessed = Paths.get(fLuceneIndex.getAbsolutePath(), "/unpostprocessed");
Files.createDirectory(pUnpostProcessed);
for (File fFileInOriginIndex : fLuceneIndex.listFiles())
{
if(!fFileInOriginIndex.isDirectory())
{
Path pFileInOriginIndex = Paths.get(fFileInOriginIndex.getAbsolutePath());
Files.move(pFileInOriginIndex, pUnpostProcessed.resolve(pFileInOriginIndex.getFileName()));
}
}
// nun verschieben wir die neuen Dateien alle in das alte, nun leere Indexverzeichnis
Path pLuceneIndex = Paths.get(fLuceneIndex.getAbsolutePath());
for (File fFileInTmpDir : fOurTmpDir.toFile().listFiles())
{
Path pFileInTmpDir = Paths.get(fFileInTmpDir.getAbsolutePath());
Files.move(pFileInTmpDir, pLuceneIndex.resolve(pFileInTmpDir.getFileName()));
}
// fOurTmpDir.renameTo(fLuceneIndex);
FileUtils.deleteDirectory(new File(pUnpostProcessed.toString()));
FileUtils.deleteDirectory(fOurTmpDir.toFile());
Logger.getLogger(LuceneIndexCreator.class.getName()).info(
"...postprocessing finished. Needed " + StopWatch.formatTimeDistance(System.currentTimeMillis() - lStart));
}
}