/* * Copyright 2011 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.web1t.util; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.util.Comparator; import java.util.LinkedList; import java.util.List; public class Web1TFileConsolidator { private final List<File> inputFiles; private final Comparator<String> comparator; private LinkedList<File> consolidatedFiles = new LinkedList<File>(); private final String fileEncoding; private final int minFreq; private final String TAB = "\t"; private final String LF = "\n"; public Web1TFileConsolidator(List<File> sortedInputFiles, Comparator<String> comparator, String fileEncoding, int minFreq) { this.inputFiles = sortedInputFiles; this.comparator = comparator; this.fileEncoding = fileEncoding; this.minFreq = minFreq; } public void consolidate() throws IOException { consolidatedFiles = new LinkedList<File>(); // new temporary files for storing the sorted and consolidated data for (File file : inputFiles) { consolidatedFiles.add(new File(Web1TUtil .cutOffUnderscoredSuffixFromFileName(file) + "_cons")); } for (int i = 0; i < inputFiles.size(); i++) { File file_in = inputFiles.get(i); File file_out = consolidatedFiles.get(i); BufferedReader sortedSplitFileReader = new BufferedReader( new InputStreamReader(new FileInputStream(file_in), fileEncoding)); BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( new FileOutputStream(file_out), fileEncoding)); String prevEntry = null; String entry = null; Integer prevEntryFreq = null; Integer entryFreq = null; while ((entry = sortedSplitFileReader.readLine()) != null) { int tabPos = entry.indexOf(TAB); if (hasLineInvalidFormat(tabPos)) { System.err.println("Wrong file format in line: " + entry); continue; } String entryWithoutFreq = extractTextValue(entry, tabPos); entryFreq = extractFreqValue(entry, tabPos); if (isFirstIteration(prevEntry, prevEntryFreq)) { prevEntry = entryWithoutFreq; prevEntryFreq = entryFreq; } else { // Entries are equal, add up frequency if (arePrevEntryAndCurrentEntryEqual(prevEntry, entryWithoutFreq, comparator)) { prevEntryFreq += entryFreq; } else { // Entry changed, write aggregated entry writeAggregatedEntryToFile(writer, prevEntry, prevEntryFreq); // Prepare next iteration prevEntry = entryWithoutFreq; prevEntryFreq = entryFreq; } } } writeAggregatedEntryToFile(writer, prevEntry, prevEntryFreq); writer.close(); sortedSplitFileReader.close(); } } private void writeAggregatedEntryToFile(BufferedWriter writer, String entry, Integer entryFrequency) throws IOException { if (entryFrequency < minFreq) { return; } writer.write(entry + TAB + entryFrequency + LF); } private boolean arePrevEntryAndCurrentEntryEqual(String prevEntry, String entryWithoutFreq, Comparator<String> comparator) { return comparator.compare(prevEntry, entryWithoutFreq) == 0; } private boolean isFirstIteration(String prevEntry, Integer prevEntryFreq) { return prevEntry == null || prevEntryFreq == null; } private boolean hasLineInvalidFormat(int tabPos) { return (tabPos < 0); } private Integer extractFreqValue(String entry, int tabPos) { String freqOfEntryAsString = entry.substring(tabPos + 1); Integer freqOfEntryAsInt = Integer.parseInt(freqOfEntryAsString); return freqOfEntryAsInt; } private String extractTextValue(String entry, int tabPos) { return entry.substring(0, tabPos); } public LinkedList<File> getConsolidatedFiles() { return new LinkedList<File>(consolidatedFiles); } public void cleanUp() { for (File file : consolidatedFiles) { file.delete(); } consolidatedFiles = new LinkedList<File>(); } }