/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package de.tudarmstadt.ukp.dkpro.core.decompounding.web1t;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary;
/**
* Index the Google Web1T corpus in Lucene.
*
* All values are stored in the index. The fields are * gram: The n-gram * freq: The frequency of
* the n-gram in the corpus
*
* Note: This was only tested with the german corpus of Web1T. The english one is much bigger and
* Lucene can only handle Integer.MAX_VALUE (2 147 483 647) documents per index. Each n-gram is a
* document.
*
* In the /bin folder is a script file to run the indexer. Simple run:
*
* ./bin/web1TLuceneIndexer.sh \ --web1t PATH/TO/FOLDER/WITH/ALL/EXTRACTED/N-GRAM/FILES \
* --outputPath PAHT/TO/LUCENE/INDEX/FOLDER
*
*/
public class LuceneIndexer
{
private final File web1tFolder;
private final File outputPath;
private int indexes;
private Dictionary dictionary;
private static Log logger;
/**
* A Worker thread.
*
*/
protected static class Worker
extends Thread
{
private final List<File> files;
private final File output;
private final Dictionary dict;
public Worker(List<File> aFileList, File aOutputFolder, Dictionary aDictionary)
{
files = aFileList;
output = aOutputFolder;
dict = aDictionary;
output.mkdirs();
}
@Override
public void run()
{
try {
IndexWriter writer = new IndexWriter(FSDirectory.open(output),
new StandardAnalyzer(Version.LUCENE_30), true,
IndexWriter.MaxFieldLength.LIMITED);
writer.setMaxBufferedDocs(10000);
writer.setRAMBufferSizeMB(512);
int i = 0;
for (File file : files) {
if (!file.isFile()) {
continue;
}
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(file));
String line;
String[] split;
Document doc;
while ((line = reader.readLine()) != null) {
split = line.split("\t");
boolean add = true;
if (dict != null) {
add = false;
for (String word : split[0].split(" ")) {
if (dict.contains(word)) {
add = true;
break;
}
}
}
if (add) {
doc = new Document();
doc.add(new Field("gram", split[0], Field.Store.YES,
Field.Index.ANALYZED));
doc.add(new Field("freq", split[1], Field.Store.YES,
Field.Index.NOT_ANALYZED));
writer.addDocument(doc);
}
}
i++;
logger.info(file.getName() + " is Ready. Only " + (files.size() - i)
+ " files left ...");
}
finally {
IOUtils.closeQuietly(reader);
}
}
logger.info("The index is optimized for you! This can take a moment...");
writer.optimize();
writer.close();
}
catch (CorruptIndexException e) {
logger.error(e.getMessage());
e.printStackTrace();
}
catch (LockObtainFailedException e) {
logger.error(e.getMessage());
e.printStackTrace();
}
catch (IOException e) {
logger.error(e.getMessage());
e.printStackTrace();
}
}
}
/**
* Constructor to create a indexer instance
*
* @param aWeb1tFolder
* The folder with all extracted n-gram files
* @param aOutputPath
* The lucene index folder
*/
public LuceneIndexer(File aWeb1tFolder, File aOutputPath)
{
this(aWeb1tFolder, aOutputPath, 1);
}
/**
* Constructor to create a indexer instance
*
* @param aWeb1tFolder
* The folder with all extracted n-gram files
* @param aOutputPath
* The lucene index folder
* @param aIndexes
* The number of indexes
*/
public LuceneIndexer(File aWeb1tFolder, File aOutputPath, int aIndexes)
{
web1tFolder = aWeb1tFolder;
outputPath = aOutputPath;
indexes = aIndexes;
logger = LogFactory.getLog(this.getClass());
}
/**
* Create the index. This is a very long running function. It will output some information on
* stdout.
*
* @throws FileNotFoundException
* if the index could not be found.
* @throws InterruptedException
* if threads were interrupted.
*/
public void index()
throws FileNotFoundException, InterruptedException
{
List<File> files;
if (web1tFolder.isFile()) {
files = Arrays.asList(new File[] { web1tFolder });
}
else if (web1tFolder.isDirectory()) {
files = Arrays.asList(web1tFolder.listFiles(new FileFilter()
{
@Override
public boolean accept(File pathname)
{
return pathname.getName().endsWith(".txt");
}
}));
}
else {
throw new FileNotFoundException("File " + web1tFolder + " cannot be found.");
}
if (indexes > files.size()) {
indexes = files.size();
}
logger.info("Oh, you started a long running task. Take a cup of coffee ...");
int perIndex = (int) Math.ceil((float) files.size() / (float) indexes);
Worker[] workers = new Worker[indexes];
for (int i = 0; i < indexes; i++) {
int start = i * perIndex;
int end = start + perIndex;
if (end > files.size()) {
end = files.size();
}
logger.info(StringUtils.join(files.subList(start, end), ", "));
Worker w = new Worker(files.subList(start, end), new File(outputPath.getAbsoluteFile()
+ "/" + i), dictionary);
w.start();
workers[i] = w;
}
for (int i = 0; i < indexes; i++) {
workers[i].join();
}
logger.info("Great, index is ready. Have fun!");
}
public Dictionary getDictionary()
{
return dictionary;
}
public void setDictionary(Dictionary aDictionary)
{
dictionary = aDictionary;
}
}