package de.dfki.km.leech.util;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.SAXException;
import de.dfki.inquisition.collections.MultiValueHashMap;
import de.dfki.inquisition.lucene.FieldConfig;
import de.dfki.inquisition.processes.StopWatch;
import de.dfki.inquisition.text.StringUtils;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.lucene.LeechDefaultFieldConfig;
import de.dfki.km.leech.lucene.ToLuceneContentHandler;
import de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser;
import de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.WikipediaDumpParserConfig;
import de.dfki.km.leech.sax.CrawlReportContentHandler;
import de.dfki.km.leech.sax.PrintlnContentHandler;
import de.dfki.km.leech.sax.PrintlnContentHandler.Verbosity;
/**
 * A very simple Lucene index creator. The sources are crawled with {@link Leech} and written into a Lucene index that is configured by
 * {@link LeechDefaultFieldConfig}. Optionally, the resulting index is post-processed afterwards (buzzwords, page counts, frequency classes, static attribute
 * value pairs). The command line entry point additionally puts a {@link WikipediaDumpParserConfig} (the configuration type of {@link WikipediaDumpParser}) and
 * a {@link CrawlerContext} into the parse context.
 *
 * @author Christian Reuschling, Dipl.Ing.(BA)
 *
 */
public class LuceneIndexCreator
{

    private static final Logger LOG = Logger.getLogger(LuceneIndexCreator.class.getName());

    /**
     * The value handed to {@link CrawlReportContentHandler#setCyclicReportPrintln} for every crawl. NOTE(review): presumably milliseconds (the default is
     * 1000 * 60) - confirm against CrawlReportContentHandler.
     */
    public static long cyclicReportTime = 1000 * 60;



    /**
     * Convenience variant of
     * {@link #createIndex(List, String, LinkedList, String, int, boolean, String, MultiValueHashMap, boolean, ParseContext)} that uses a fresh, empty
     * {@link ParseContext}.
     */
    public static void createIndex(List<String> lUrls2Crawl, String strLuceneIndexPath, LinkedList<String> llLookupIndexPaths, String strBuzzwordAttName,
            int iBuzzwordCount, boolean bCalculatePageCounts, String strFrequencyClassAttName, MultiValueHashMap<String, String> hsStaticAttValuePairs,
            boolean bPrintErrors) throws IOException, Exception, SAXException, TikaException
    {
        createIndex(lUrls2Crawl, strLuceneIndexPath, llLookupIndexPaths, strBuzzwordAttName, iBuzzwordCount, bCalculatePageCounts, strFrequencyClassAttName,
                hsStaticAttValuePairs, bPrintErrors, null);
    }



    /**
     * Crawls the given sources into a Lucene index and/or post-processes the index.
     *
     * @param lUrls2Crawl the sources to crawl. Must not be null. In the case strLuceneIndexPath is null, the first entry is taken as the path of an existing
     *            index and only post-processing is performed on it
     * @param strLuceneIndexPath the path of the index to write (opened with CREATE_OR_APPEND), or null for post-processing only
     * @param llLookupIndexPaths lookup index paths that are handed to the post-processing. null is treated as an empty list
     * @param strBuzzwordAttName the attribute name for buzzword generation. null or whitespace disables this step
     * @param iBuzzwordCount the buzzword count handed to the buzzword generation
     * @param bCalculatePageCounts true enables the page count estimation step
     * @param strFrequencyClassAttName the attribute name for the frequency class calculation. null or whitespace disables this step
     * @param hsStaticAttValuePairs static attribute value pairs. During crawling they are handed to the {@link ToLuceneContentHandler}, in post-processing-only
     *            mode they are handed to the post-processor. null is treated as empty
     * @param bPrintErrors true wraps the content handler into a {@link PrintlnContentHandler} that shows only errors
     * @param context the parse context for crawling, null creates a new one. The used {@link FieldConfig} will be set into it
     *
     * @throws IllegalArgumentException if lUrls2Crawl is null, or empty while strLuceneIndexPath is null
     */
    public static void createIndex(List<String> lUrls2Crawl, String strLuceneIndexPath, LinkedList<String> llLookupIndexPaths, String strBuzzwordAttName,
            int iBuzzwordCount, boolean bCalculatePageCounts, String strFrequencyClassAttName, MultiValueHashMap<String, String> hsStaticAttValuePairs,
            boolean bPrintErrors, ParseContext context) throws IOException, Exception, SAXException, TikaException
    {
        if(context == null) context = new ParseContext();
        if(llLookupIndexPaths == null) llLookupIndexPaths = new LinkedList<>();
        if(hsStaticAttValuePairs == null) hsStaticAttValuePairs = new MultiValueHashMap<>();

        // fail early with a clear message, before any index is opened
        if(lUrls2Crawl == null) throw new IllegalArgumentException("The list of sources to crawl must not be null");


        boolean bOnlyPostProcessing = false;

        if(strLuceneIndexPath == null)
        {
            if(lUrls2Crawl.isEmpty())
                throw new IllegalArgumentException("Neither a source to crawl nor a Lucene index path was specified");

            // without a target index, the first 'source' is the index that should be post-processed
            strLuceneIndexPath = lUrls2Crawl.iterator().next();
            bOnlyPostProcessing = true;

            LOG.info("Will perform only postprocessing (buzzwords and/or calculated page counts, as configured) on " + strLuceneIndexPath);
        }
        else
        {
            crawlIntoIndex(lUrls2Crawl, strLuceneIndexPath, hsStaticAttValuePairs, bPrintErrors, context);
        }


        // the post-processing
        IndexPostprocessor postprocessor = new IndexPostprocessor();
        boolean bPerformPostProcessing = false;

        // a step is disabled if its configuration value is null / empty
        if(!StringUtils.nullOrWhitespace(strBuzzwordAttName))
        {
            postprocessor.enableBuzzwordGeneration(strBuzzwordAttName, iBuzzwordCount, true);
            bPerformPostProcessing = true;
        }
        if(bCalculatePageCounts)
        {
            postprocessor.enablePageCountEstimation();
            bPerformPostProcessing = true;
        }
        if(!StringUtils.nullOrWhitespace(strFrequencyClassAttName))
        {
            postprocessor.enableFrequencyClassCalculation(strFrequencyClassAttName);
            bPerformPostProcessing = true;
        }
        // during crawling, the static pairs were already added by the content handler - so this is only necessary without crawling
        if(bOnlyPostProcessing && hsStaticAttValuePairs.keySize() > 0)
        {
            Metadata staticAtts2Values = new Metadata();
            for (Entry<String, String> att2Value : hsStaticAttValuePairs.entryList())
                staticAtts2Values.add(att2Value.getKey(), att2Value.getValue());

            postprocessor.enableStaticAttributeValuePairs(staticAtts2Values);
            bPerformPostProcessing = true;
        }


        if(bPerformPostProcessing)
            postprocessor.postprocessIndex(strLuceneIndexPath, new LeechDefaultFieldConfig(), llLookupIndexPaths.toArray(new String[0]));
        else
            LOG.info("no postprocessing necessary");
    }



    /**
     * Crawls the given sources and writes the result into the index under strLuceneIndexPath. The index is committed and merged into one segment on success.
     * Directory and writer are closed in any case, thus the index write lock is also released when crawling fails.
     */
    private static void crawlIntoIndex(List<String> lUrls2Crawl, String strLuceneIndexPath, MultiValueHashMap<String, String> hsStaticAttValuePairs,
            boolean bPrintErrors, ParseContext context) throws Exception
    {
        LOG.info("Crawling " + lUrls2Crawl);
        if(hsStaticAttValuePairs.keySize() > 0) LOG.info("Will add static attribute value pairs to each document: " + hsStaticAttValuePairs);

        Leech leech = new Leech();
        long startTime = StopWatch.startAndLogTime(Level.INFO);

        FieldConfig fieldConfig = new LeechDefaultFieldConfig();
        context.set(FieldConfig.class, fieldConfig);

        @SuppressWarnings("deprecation")
        IndexWriterConfig config = new IndexWriterConfig(fieldConfig.createAnalyzer());
        config.setOpenMode(OpenMode.CREATE_OR_APPEND);
        // closing after a failed crawl must not commit a half-written state - as before, only the explicit commits below make changes durable
        config.setCommitOnClose(false);

        try (SimpleFSDirectory directory = new SimpleFSDirectory(Paths.get(strLuceneIndexPath));
                IndexWriter indexWriter = new IndexWriter(directory, config))
        {
            // NOTE(review): an empty map presumably means that no document is ignored - confirm against ToLuceneContentHandler.
            // Example for a constraint: hsFieldName2FieldValue.put("infobox", "[Bb]and");
            Map<String, String> hsFieldName2FieldValue = new HashMap<String, String>();

            ToLuceneContentHandler toLuceneContentHandler =
                    new ToLuceneContentHandler(fieldConfig, indexWriter).setIgnoreAllDocsWithout(hsFieldName2FieldValue).setStaticAttributeValuePairs(
                            hsStaticAttValuePairs);

            CrawlReportContentHandler reportContentHandler;
            if(bPrintErrors)
                reportContentHandler = new CrawlReportContentHandler(new PrintlnContentHandler(Verbosity.all, toLuceneContentHandler).setShowOnlyErrors(true));
            else
                reportContentHandler = new CrawlReportContentHandler(toLuceneContentHandler);

            leech.parse(lUrls2Crawl.toArray(new String[0]), reportContentHandler.setCyclicReportPrintln(cyclicReportTime), context);

            LOG.info("Will commit and merge");
            indexWriter.commit();
            indexWriter.forceMerge(1, true);
            // the merge result needs its own commit, because closing does not commit anymore
            indexWriter.commit();
        }

        StopWatch.stopAndLogDistance(startTime, Level.INFO);
        LOG.info("..finished crawling " + lUrls2Crawl);
    }



    /**
     * Command line entry point. Invoke it without arguments or with -?, -h or --help to get the usage description. Arguments starting with '-' are options,
     * all other arguments are paths: with more than one path, the last one is the target Lucene index and the others are the sources to crawl. With exactly
     * one path, only post-processing will be performed on it.
     *
     * @param args the command line arguments as described in the usage text
     *
     * @throws Exception if the arguments are invalid or crawling / post-processing fails
     */
    public static void main(String[] args) throws Exception
    {
        if(args.length == 0 || args[0].equals("-?") || args[0].equals("-h") || args[0].equals("--help"))
        {
            System.out.println("Usage: LuceneIndexCreator [-noPageRedirects] [-noParseGeoCoordinates] [-parseInfoBoxes] [-parseLinksAndCategories]\n"
                    + " [-<staticAttName>=<staticAttValue>] [-buzzwordAttName=<attName>] [-buzzwordCount=<count>] [-calculatePageCounts] [-printErrors]\n"
                    + "[-frequencyClassAttName=<attName>] [-li <readonlyLookupIndexPath>] [-crawlingDepth=<depth>]"
                    + " <fileOrDir2CrawlPath1> .. <fileOrDir2CrawlPathN> <targetLuceneIndexPath>\n\nComments: - you can specify several static attribute value pairs.\n"
                    + "- if you leave <fileOrDir2CrawlPath>, only postprocessing will be performed.\n" + "- you can add several lookup indices (-li).\n"
                    + "- if you leave the buzzword attName or the frequency class attName, these processing steps will be skiped.");
            System.out.println();

            return;
        }

        final String strBuzzwordAttNamePrefix = "-buzzwordAttName=";
        final String strBuzzwordCountPrefix = "-buzzwordCount=";
        final String strCrawlingDepthPrefix = "-crawlingDepth=";
        final String strFrequencyClassAttNamePrefix = "-frequencyClassAttName=";

        // all non-option arguments, in their given order
        LinkedList<String> llFile2CrawlPath = new LinkedList<>();

        String strBuzzwordAttName = null;
        String strFrequencyClassAttName = null;
        int iBuzzwordCount = 7;
        boolean bCalculatePageCounts = false;
        LinkedList<String> llLookupIndexPaths = new LinkedList<String>();
        int iCrawlingDepth = Integer.MAX_VALUE;
        boolean bPrintErrors = false;

        ParseContext context = new ParseContext();

        WikipediaDumpParserConfig wikipediaDumpParserConfig =
                new WikipediaDumpParserConfig().setDeterminePageRedirects(true).setParseGeoCoordinates(true).setParseInfoBoxes(false).setParseLinksAndCategories(false);
        context.set(WikipediaDumpParserConfig.class, wikipediaDumpParserConfig);

        MultiValueHashMap<String, String> hsStaticAttValuePairs = new MultiValueHashMap<String, String>();

        for (int i = 0; i < args.length; i++)
        {
            String strArg = args[i];

            if(strArg.equals("-noPageRedirects"))
            {
                wikipediaDumpParserConfig.setDeterminePageRedirects(false);
            }
            else if(strArg.equals("-noParseGeoCoordinates"))
            {
                wikipediaDumpParserConfig.setParseGeoCoordinates(false);
            }
            else if(strArg.equals("-parseInfoBoxes"))
            {
                wikipediaDumpParserConfig.setParseInfoBoxes(true);
            }
            else if(strArg.equals("-parseLinksAndCategories"))
            {
                wikipediaDumpParserConfig.setParseLinksAndCategories(true);
            }
            else if(strArg.startsWith(strBuzzwordAttNamePrefix))
            {
                strBuzzwordAttName = strArg.substring(strBuzzwordAttNamePrefix.length()).trim();
            }
            else if(strArg.startsWith(strBuzzwordCountPrefix))
            {
                iBuzzwordCount = Integer.parseInt(strArg.substring(strBuzzwordCountPrefix.length()));
            }
            else if(strArg.startsWith(strCrawlingDepthPrefix))
            {
                iCrawlingDepth = Integer.parseInt(strArg.substring(strCrawlingDepthPrefix.length()));
            }
            else if(strArg.startsWith(strFrequencyClassAttNamePrefix))
            {
                strFrequencyClassAttName = strArg.substring(strFrequencyClassAttNamePrefix.length()).trim();
            }
            else if(strArg.startsWith("-calculatePageCounts"))
            {
                bCalculatePageCounts = true;
            }
            else if(strArg.startsWith("-printErrors"))
            {
                bPrintErrors = true;
            }
            else if(strArg.equals("-li"))
            {
                // exact match - otherwise static attributes as '-license=x' would be taken as lookup index option
                if(i + 1 >= args.length) throw new IllegalArgumentException("Option -li needs a lookup index path as following argument");

                llLookupIndexPaths.add(args[++i]);
            }
            else if(strArg.startsWith("-"))
            {
                // static attribute value pair '-<name>=<value>'. The value may be empty or contain further '=' characters
                String strPair = strArg.substring(1);
                int iSeparatorIndex = strPair.indexOf('=');
                if(iSeparatorIndex < 0) continue;

                hsStaticAttValuePairs.add(strPair.substring(0, iSeparatorIndex), strPair.substring(iSeparatorIndex + 1));
            }
            else
            {
                llFile2CrawlPath.add(strArg);
            }
        }

        // with several paths, the last one is the target index - independent from whether options follow behind. A single path means postprocessing only
        String strLuceneIndexPath = null;
        if(llFile2CrawlPath.size() > 1) strLuceneIndexPath = llFile2CrawlPath.removeLast();


        LOG.info("crawling depth is " + iCrawlingDepth);

        CrawlerContext crawlerContext = new CrawlerContext().setCrawlingDepth(iCrawlingDepth);
        context.set(CrawlerContext.class, crawlerContext);

        createIndex(llFile2CrawlPath, strLuceneIndexPath, llLookupIndexPaths, strBuzzwordAttName, iBuzzwordCount, bCalculatePageCounts, strFrequencyClassAttName,
                hsStaticAttValuePairs, bPrintErrors, context);
    }
}