/*
* Copyright 2011
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.web1t.util;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.util.CasUtil;
import org.apache.uima.jcas.JCas;
import com.googlecode.jweb1t.JWeb1TIndexer;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathInfo;
import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.ConditionalFrequencyDistribution;
import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
import de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringIterable;
/**
 * Collects n-gram counts and writes them to an output directory in the file layout that is
 * afterwards handed to {@link JWeb1TIndexer}.
 * <p>
 * Intended usage: create the converter, call one of the {@code add(...)} methods as often as
 * needed, then call {@link #createIndex()} exactly once. {@link #createIndex()} closes the
 * intermediate writers, so no further data can be added afterwards.
 * <p>
 * For every n-gram level {@code n} between the minimum and maximum n-gram length an intermediate
 * file {@code <outputPath>/n.txt} is written (one {@code ngram TAB count} line per n-gram and
 * call to {@code add}). {@link #createIndex()} splits, sorts and consolidates these files into
 * the folder {@code <outputPath>/<n>gms} and removes the intermediate file.
 */
public class Web1TConverter
{
    public static final String SENTENCE_START = "<S>";
    public static final String SENTENCE_END = "</S>";

    private static final String LF = "\n";
    private static final String TAB = "\t";

    /** Name of the catch-all file that {@link #processCreatedMiscFileAgain} looks for. */
    private static final String MISC_FILE_NAME = "99999999";

    private String outputPath;
    private String outputEncoding = "UTF-8";
    private int minNgramLength = 1;
    private int maxNgramLength = 3;
    private int minFrequency = 1;
    private boolean toLowercase = false;
    private boolean writeIndexes = true;
    private float splitThreshold = 1.0f;

    /** One writer per n-gram level, writing to {@code <outputPath>/<level>.txt}. */
    private Map<Integer, BufferedWriter> ngramWriters;

    /** Per n-gram level: how often each (lower-cased) one/two character n-gram prefix occurred. */
    private Map<Integer, FrequencyDistribution<String>> letterFDs;

    /**
     * Creates a converter using the default n-gram range (1 to 3).
     *
     * @param outputPath
     *            directory into which all files are written
     * @throws IOException
     *             if the intermediate n-gram files cannot be created
     */
    public Web1TConverter(String outputPath)
        throws IOException
    {
        init(outputPath);
    }

    /**
     * Creates a converter for the given n-gram range.
     *
     * @param outputPath
     *            directory into which all files are written
     * @param aMinNGramLength
     *            smallest n-gram length to record (inclusive)
     * @param aMaxNGramLength
     *            largest n-gram length to record (inclusive)
     * @throws IOException
     *             if the intermediate n-gram files cannot be created
     */
    public Web1TConverter(String outputPath, int aMinNGramLength, int aMaxNGramLength)
        throws IOException
    {
        this.minNgramLength = aMinNGramLength;
        this.maxNgramLength = aMaxNGramLength;
        init(outputPath);
    }

    /**
     * Stores the output path and creates the per-level writers and letter distributions for the
     * n-gram range configured at this point.
     */
    private void init(String aOutputPath)
        throws IOException
    {
        // Validate before any file is touched.
        checkSplitThreshold(splitThreshold);
        this.outputPath = aOutputPath;
        ngramWriters = initializeWriters(minNgramLength, maxNgramLength);
        letterFDs = initializeLetterFDs(minNgramLength, maxNgramLength);
    }

    /**
     * Extracts n-grams from the given CAS and appends their counts to the intermediate files.
     * <p>
     * For every annotation of {@code sentenceType} and every input path, the annotations of the
     * path's type covered by the sentence are converted to strings, framed by
     * {@link #SENTENCE_START} and {@link #SENTENCE_END}, and all n-grams of the configured lengths
     * are counted.
     *
     * @param jcas
     *            the document to read from
     * @param inputPaths
     *            entries of the form {@code typeName} or {@code typeName/featurePath}; the part
     *            after the first {@code /} is used as feature path on the selected annotations
     * @param sentenceType
     *            the annotation type delimiting the units within which n-grams are built
     * @throws IOException
     *             if the feature path cannot be evaluated or the n-gram files cannot be written
     * @throws IllegalStateException
     *             if a type named in {@code inputPaths} is not part of the type system
     */
    public void add(JCas jcas, Set<String> inputPaths, Type sentenceType)
        throws IOException
    {
        ConditionalFrequencyDistribution<Integer, String> cfd =
                new ConditionalFrequencyDistribution<Integer, String>();
        CAS cas = jcas.getCas();

        for (AnnotationFS sentence : CasUtil.select(cas, sentenceType)) {
            for (String path : inputPaths) {
                // segments[0] = type name, segments[1] (optional) = feature path
                String[] segments = path.split("/", 2);
                Type type = getInputType(cas, segments[0]);
                List<AnnotationFS> tokens = CasUtil.selectCovered(cas, type, sentence);

                List<String> tokenStrings;
                try {
                    tokenStrings = createStringList(tokens, segments);
                }
                catch (AnalysisEngineProcessException e) {
                    throw new IOException(e);
                }

                for (int n = minNgramLength; n <= maxNgramLength; n++) {
                    cfd.incAll(n, new NGramStringIterable(tokenStrings, n, n));
                }
            }
        }

        add(cfd);
    }

    /**
     * Appends the given counts to the intermediate n-gram files. The conditions of the
     * distribution are interpreted as n-gram levels.
     *
     * @param cfd
     *            n-gram counts, keyed by n-gram level
     * @throws IOException
     *             if the distribution contains a level for which no writer exists, or if writing
     *             fails
     */
    public void add(ConditionalFrequencyDistribution<Integer, String> cfd)
        throws IOException
    {
        writeFrequencyDistributionsToNGramFiles(cfd);
    }

    /**
     * Finishes the conversion: closes the intermediate files, aggregates the counts per n-gram
     * level into the final files and, if {@link #isWriteIndexes()} is set, runs
     * {@link JWeb1TIndexer} on the output directory.
     *
     * @throws IOException
     *             if any of the file operations fails
     */
    public void createIndex()
        throws IOException
    {
        closeWriters(ngramWriters.values());

        Comparator<String> comparator = new Comparator<String>()
        {
            @Override
            public int compare(String r1, String r2)
            {
                return r1.compareTo(r2);
            }
        };

        // Read the file with the counts per add() call and create the final aggregated counts.
        for (int level = minNgramLength; level <= maxNgramLength; level++) {
            int nextFreeFileNumber = processInputFileForLevel(level, comparator);
            processCreatedMiscFileAgain(level, comparator, nextFreeFileNumber);
        }

        if (writeIndexes) {
            JWeb1TIndexer indexer = new JWeb1TIndexer(outputPath, maxNgramLength);
            indexer.create();
        }
    }

    /**
     * Splits the intermediate file of the given level into the level's output folder, then
     * sorts and consolidates the parts and removes the intermediate file.
     *
     * @return the next file number not used by the splitter
     */
    private int processInputFileForLevel(int level, Comparator<String> comparator)
        throws IOException
    {
        File unsortedInputFile = new File(outputPath, level + ".txt");
        File outputFolder = getOutputFolder(level);

        // mkdir() also returns false if the folder already exists, so check the outcome instead.
        outputFolder.mkdir();
        if (!outputFolder.isDirectory()) {
            throw new IOException("Unable to create output folder [" + outputFolder + "]");
        }

        Web1TFileSplitter splitter = new Web1TFileSplitter(unsortedInputFile, outputFolder,
                outputEncoding, letterFDs.get(level), splitThreshold, 0);
        splitter.split();

        sortConsolidateAndRename(splitter, comparator);

        unsortedInputFile.delete();
        return splitter.getNextUnusedFileNumber();
    }

    /**
     * Shared tail of the processing pipeline: sorts the files produced by {@code splitter},
     * consolidates the sorted files and renames the consolidated files to their final names. The
     * temporary files of each step are removed as soon as the following step has consumed them.
     */
    private void sortConsolidateAndRename(Web1TFileSplitter splitter,
            Comparator<String> comparator)
        throws IOException
    {
        Web1TFileSorter sorter = new Web1TFileSorter(splitter.getFiles(), comparator);
        sorter.sort();
        splitter.cleanUp(); // split files are no longer needed once sorted

        Web1TFileConsolidator consolidator = new Web1TFileConsolidator(sorter.getSortedFiles(),
                comparator, outputEncoding, minFrequency);
        consolidator.consolidate();
        sorter.cleanUp(); // sorted files are no longer needed once consolidated

        // rename consolidated files -> final index files
        for (File file : consolidator.getConsolidatedFiles()) {
            File target = new File(Web1TUtil.cutOffUnderscoredSuffixFromFileName(file));
            // A failed rename followed by cleanUp() would silently lose the data, so fail loudly.
            // A source that does not exist is tolerated, as there is nothing to lose.
            if (!file.renameTo(target) && file.exists()) {
                throw new IOException("Unable to rename [" + file + "] to [" + target + "]");
            }
        }
        consolidator.cleanUp();
    }

    /**
     * Write the frequency distributions to the corresponding n-gram files.
     */
    private void writeFrequencyDistributionsToNGramFiles(
            ConditionalFrequencyDistribution<Integer, String> cfd)
        throws IOException
    {
        for (int level : cfd.getConditions()) {
            if (!ngramWriters.containsKey(level)) {
                throw new IOException("No writer for ngram level " + level + " initialized.");
            }
            writeNGramFile(cfd, level);
        }
    }

    /**
     * Appends one {@code ngram TAB count} line per n-gram of the given level to the level's
     * intermediate file and records the lower-cased n-gram prefix (two characters, or one for
     * single-character n-grams) in the level's letter distribution.
     */
    private void writeNGramFile(ConditionalFrequencyDistribution<Integer, String> cfd, int level)
        throws IOException
    {
        FrequencyDistribution<String> letterFD = letterFDs.get(level);
        BufferedWriter writer = ngramWriters.get(level);

        for (String ngram : cfd.getFrequencyDistribution(level).getKeys()) {
            // add starting letter(s) to frequency distribution
            int prefixLength = ngram.length() > 1 ? 2 : 1;
            letterFD.addSample(ngram.substring(0, prefixLength).toLowerCase(), 1);

            writer.write(ngram);
            writer.write(TAB);
            writer.write(Long.toString(cfd.getCount(level, ngram)));
            writer.write(LF);
        }
        writer.flush();
    }

    /**
     * Converts the given annotations to strings by evaluating the feature path contained in
     * {@code segments} on each of them. Blank values are dropped, values are lower-cased if
     * requested, and the result is framed by {@link #SENTENCE_START} and {@link #SENTENCE_END}.
     */
    private List<String> createStringList(List<AnnotationFS> tokens, String[] segments)
        throws AnalysisEngineProcessException
    {
        FeaturePathInfo fp = new FeaturePathInfo();
        initializeFeaturePathInfoFrom(fp, segments);

        List<String> tokenStrings = new ArrayList<String>();
        tokenStrings.add(SENTENCE_START);
        for (AnnotationFS annotation : tokens) {
            String value = fp.getValue(annotation);
            if (StringUtils.isBlank(value)) {
                continue;
            }
            tokenStrings.add(toLowercase ? value.toLowerCase() : value);
        }
        tokenStrings.add(SENTENCE_END);
        return tokenStrings;
    }

    /**
     * Looks up a type by name.
     *
     * @throws IllegalStateException
     *             if the type system does not contain the type
     */
    private Type getInputType(CAS cas, String typeName)
    {
        Type type = cas.getTypeSystem().getType(typeName);
        if (type == null) {
            throw new IllegalStateException("Type [" + typeName + "] not found in type system");
        }
        return type;
    }

    /**
     * Initializes the feature path with the second element of {@code featurePathString}, or with
     * the empty path if there is no second element.
     */
    private void initializeFeaturePathInfoFrom(FeaturePathInfo aFp, String[] featurePathString)
        throws AnalysisEngineProcessException
    {
        String featurePath = featurePathString.length > 1 ? featurePathString[1] : "";
        try {
            aFp.initialize(featurePath);
        }
        catch (FeaturePathException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * The default file for words which do not account for <code>splitThreshold</code> percent
     * may have grown large. In order to prevent a really large misc. file we split it again,
     * using a tenth of the configured threshold.
     */
    private void processCreatedMiscFileAgain(int level, Comparator<String> comparator,
            int nextFileNumber)
        throws IOException
    {
        File folder = getOutputFolder(level);
        File misc = new File(folder, MISC_FILE_NAME);
        if (!misc.exists()) {
            return;
        }

        FrequencyDistribution<String> letterFD = createFreqDistForMiscFile(misc);

        // Make sure that the misc file is split into little pieces. A local value is used so that
        // the configured threshold is never left modified if one of the steps below fails.
        float miscThreshold = splitThreshold / 10;

        // The misc file was written with the configured output encoding, so read it with it.
        Web1TFileSplitter splitter = new Web1TFileSplitter(misc, folder, outputEncoding,
                letterFD, miscThreshold, nextFileNumber);
        splitter.split();

        // The content now lives in the split files; remove the original before the final files
        // are renamed into place.
        misc.delete();

        sortConsolidateAndRename(splitter, comparator);
    }

    /**
     * Creates a new frequency distribution over the starting letters in the misc file as
     * preparation for splitting. Lines without a TAB separator are ignored.
     */
    private FrequencyDistribution<String> createFreqDistForMiscFile(File misc)
        throws IOException
    {
        FrequencyDistribution<String> letterFD = new FrequencyDistribution<String>();
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(misc), outputEncoding));
        try {
            String readLine;
            while ((readLine = reader.readLine()) != null) {
                int indexOfTab = readLine.indexOf(TAB);
                if (indexOfTab < 0) {
                    // not an "ngram TAB count" line; there is no n-gram to take a prefix from
                    continue;
                }
                letterFD.addSample(getStartingLetters(readLine, indexOfTab), 1);
            }
        }
        finally {
            reader.close();
        }
        return letterFD;
    }

    /**
     * @return the folder {@code <outputPath>/<level>gms}
     */
    private File getOutputFolder(int level)
    {
        return new File(outputPath + "/" + level + "gms");
    }

    /**
     * Returns the lower-cased first two characters of the line if the n-gram (the text before the
     * TAB at {@code indexOfTab}) is longer than one character, otherwise the lower-cased first
     * character of the line.
     *
     * @param readLine
     *            a line of the form {@code ngram TAB count}
     * @param indexOfTab
     *            position of the TAB in {@code readLine}; must not be negative
     */
    private String getStartingLetters(String readLine, int indexOfTab)
    {
        // indexOfTab equals the length of the n-gram
        int prefixLength = indexOfTab > 1 ? 2 : 1;
        return readLine.substring(0, prefixLength).toLowerCase();
    }

    /**
     * Creates an empty letter distribution for every n-gram level in {@code [min, max]}.
     */
    private Map<Integer, FrequencyDistribution<String>> initializeLetterFDs(int min, int max)
    {
        Map<Integer, FrequencyDistribution<String>> fdistMap =
                new HashMap<Integer, FrequencyDistribution<String>>();
        for (int level = min; level <= max; level++) {
            fdistMap.put(level, new FrequencyDistribution<String>());
        }
        return fdistMap;
    }

    /**
     * Creates a fresh (empty) intermediate file {@code <outputPath>/<level>.txt} and a writer for
     * it for every n-gram level in {@code [min, max]}. If one of the files cannot be opened, the
     * writers opened so far are closed again before the exception is propagated.
     */
    private Map<Integer, BufferedWriter> initializeWriters(int min, int max)
        throws IOException
    {
        Map<Integer, BufferedWriter> writers = new HashMap<Integer, BufferedWriter>();
        boolean success = false;
        try {
            for (int level = min; level <= max; level++) {
                File outputFile = new File(outputPath, level + ".txt");
                if (outputFile.exists()) {
                    outputFile.delete();
                }
                FileUtils.touch(outputFile);
                writers.put(level, new BufferedWriter(new OutputStreamWriter(
                        new FileOutputStream(outputFile), outputEncoding)));
            }
            success = true;
            return writers;
        }
        finally {
            if (!success) {
                for (BufferedWriter writer : writers.values()) {
                    try {
                        writer.close();
                    }
                    catch (IOException ignored) {
                        // the original failure is the one worth reporting
                    }
                }
            }
        }
    }

    /**
     * Closes all given writers. Every writer is attempted even if closing an earlier one fails;
     * the first failure is rethrown afterwards.
     */
    private void closeWriters(Collection<BufferedWriter> writers)
        throws IOException
    {
        IOException firstFailure = null;
        for (BufferedWriter writer : writers) {
            try {
                writer.close();
            }
            catch (IOException e) {
                if (firstFailure == null) {
                    firstFailure = e;
                }
            }
        }
        if (firstFailure != null) {
            throw firstFailure;
        }
    }

    private static void checkSplitThreshold(float aSplitThreshold)
    {
        if (aSplitThreshold >= 100) {
            throw new IllegalArgumentException("Threshold has to be lower 100");
        }
    }

    /**
     * @return whether {@link #createIndex()} runs {@link JWeb1TIndexer} after writing the files
     */
    public boolean isWriteIndexes()
    {
        return writeIndexes;
    }

    public void setWriteIndexes(boolean writeIndexes)
    {
        this.writeIndexes = writeIndexes;
    }

    /**
     * @return the threshold handed to the file splitter
     */
    public float getSplitThreshold()
    {
        return splitThreshold;
    }

    /**
     * @param splitThreshold
     *            the threshold handed to the file splitter; must be lower than 100
     * @throws IllegalArgumentException
     *             if the value is 100 or greater
     */
    public void setSplitThreshold(float splitThreshold)
    {
        checkSplitThreshold(splitThreshold);
        this.splitThreshold = splitThreshold;
    }

    public String getOutputEncoding()
    {
        return outputEncoding;
    }

    /**
     * Sets the encoding used when splitting and consolidating the n-gram files.
     * <p>
     * NOTE: the writers for the intermediate files are created in the constructor with the
     * encoding configured at that time; this setter does not re-create them.
     */
    public void setOutputEncoding(String outputEncoding)
    {
        this.outputEncoding = outputEncoding;
    }

    public int getMinNgramLength()
    {
        return minNgramLength;
    }

    /**
     * NOTE: the writers and letter distributions are created in the constructor for the n-gram
     * range known at that time; this setter does not create them for additional levels. Prefer
     * passing the range to the constructor.
     */
    public void setMinNgramLength(int minNgramLength)
    {
        this.minNgramLength = minNgramLength;
    }

    public int getMaxNgramLength()
    {
        return maxNgramLength;
    }

    /**
     * NOTE: the writers and letter distributions are created in the constructor for the n-gram
     * range known at that time; this setter does not create them for additional levels. Prefer
     * passing the range to the constructor.
     */
    public void setMaxNgramLength(int maxNgramLength)
    {
        this.maxNgramLength = maxNgramLength;
    }

    /**
     * @return the minimum frequency handed to the file consolidator
     */
    public int getMinFrequency()
    {
        return minFrequency;
    }

    /**
     * @param minFrequency
     *            the minimum frequency handed to the file consolidator; must be at least 1
     * @throws IllegalArgumentException
     *             if the value is smaller than 1
     */
    public void setMinFrequency(int minFrequency)
    {
        if (minFrequency < 1) {
            throw new IllegalArgumentException("Parameter MIN_FREQUENCY is invalid (must be >= 1)");
        }
        this.minFrequency = minFrequency;
    }

    /**
     * @return whether token strings are lower-cased before n-grams are built
     */
    public boolean isToLowercase()
    {
        return toLowercase;
    }

    public void setToLowercase(boolean toLowercase)
    {
        this.toLowercase = toLowercase;
    }
}