/* * Copyright 2011 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.web1t.util; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.util.Collections; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import org.apache.commons.io.IOUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution; public class Web1TFileSplitter { private final Log log = LogFactory.getLog(getClass()); private final File inputFile; private final File outputFolder; private final String fileEncoding; private final FrequencyDistribution<String> letterFD; private final double threshold; private int fileNumber; private List<File> splittedFiles = new LinkedList<File>(); public Web1TFileSplitter(File aInputFile, File aOutputFolder, String aFileEncoding, FrequencyDistribution<String> aLetterFD, double aThreshold, int aStartingFileNumber) { inputFile = aInputFile; outputFolder = aOutputFolder; fileEncoding = aFileEncoding; letterFD = aLetterFD; threshold = aThreshold; fileNumber = aStartingFileNumber; } public List<File> getFiles() { return new LinkedList<File>(splittedFiles); } public void split() throws IOException { Map<String, String> letterToFileNameMap = mapStartingLettersToFilenames(); Map<String, File> fileMap = mapFileNamesToFileHandels(letterToFileNameMap); Map<File, BufferedWriter> fileHandleToBufferdWriterMap = mapFileHandelsToWriterHandels(fileMap); Map<String, BufferedWriter> writerMap = mapFileNamesToWriterHandels(fileMap, fileHandleToBufferdWriterMap); splittedFiles = generateListOfUniqueFiles(fileMap); BufferedReader reader = null; try { reader = new BufferedReader(new InputStreamReader( new FileInputStream(inputFile), fileEncoding)); String TAB = "\t"; String LF = "\n"; String readLine = null; while ((readLine = reader.readLine()) != null) { int indexOfTab = readLine.indexOf(TAB); if (indexOfTab == -1) { log.warn("No tab found in line: " + readLine); continue; } String key = Web1TUtil.getStartingLetters(readLine, indexOfTab); Writer writer = writerMap.get(key); if (writer == null) { log.warn("No writer found for key: " + key); key = key.substring(0, 1); writer = writerMap.get(key); if (writer == null) { log.warn("No writer for key: " + key); continue; } } writer.write(readLine); writer.write(LF); writer.flush(); } } finally { // Close reader IOUtils.closeQuietly(reader); // Close all writers for (Writer writer : writerMap.values()) { IOUtils.closeQuietly(writer); } } } private Map<File, BufferedWriter> mapFileHandelsToWriterHandels( Map<String, File> fileMap) throws UnsupportedEncodingException, FileNotFoundException { Map<File, BufferedWriter> fileHandleToBufferdWriterMap = new HashMap<File, BufferedWriter>(); for (String key : fileMap.keySet()) { File file = fileMap.get(key); if (fileHandleToBufferdWriterMap.get(file) == null) { fileHandleToBufferdWriterMap.put(file, new BufferedWriter( new OutputStreamWriter(new FileOutputStream(file), fileEncoding))); } } return fileHandleToBufferdWriterMap; } private Map<String, File> mapFileNamesToFileHandels( Map<String, String> letterToFileNameMap) { Map<String, File> fileMap = new HashMap<String, File>(); for (String key : letterToFileNameMap.keySet()) { fileMap.put(key, new File(outputFolder + "/" + letterToFileNameMap.get(key) + "_unsorted")); } return fileMap; } public int getNextUnusedFileNumber() { return fileNumber; } private Map<String, String> mapStartingLettersToFilenames() { Map<String, String> letterToFileNameMap = new HashMap<String, String>(); List<String> keyList = new LinkedList<String>(letterFD.getKeys()); Collections.sort(keyList); for (String key : keyList) { Long freq = letterFD.getCount(key); Long total = letterFD.getN(); double percentage = (double) freq / total * 100; if ((threshold > 0.0) && (percentage >= threshold)) { String filename = String.format("%08d", fileNumber++); letterToFileNameMap.put(key, filename); } else { letterToFileNameMap.put(key, "99999999"); } } return letterToFileNameMap; } private Map<String, BufferedWriter> mapFileNamesToWriterHandels( Map<String, File> fileMap, Map<File, BufferedWriter> fileHandleToBufferdWriterMap) throws UnsupportedEncodingException, FileNotFoundException { Map<String, BufferedWriter> nameToWriterMap = new HashMap<String, BufferedWriter>(); for (String key : fileMap.keySet()) { File file = fileMap.get(key); BufferedWriter writer = fileHandleToBufferdWriterMap.get(file); nameToWriterMap.put(key, writer); } return nameToWriterMap; } private List<File> generateListOfUniqueFiles(Map<String, File> fileMap) { // Generate unique Filelist Map<String, String> uniqeFiles = new HashMap<String, String>(); for (File file : fileMap.values()) { String absPath = file.getAbsolutePath(); if (uniqeFiles.get(absPath) == null) { uniqeFiles.put(absPath, ""); } } LinkedList<File> listOfUniqueFiles = new LinkedList<File>(); for (String path : uniqeFiles.keySet()) { listOfUniqueFiles.add(new File(path)); } return listOfUniqueFiles; } public void cleanUp() { for (File file : splittedFiles) { file.delete(); } splittedFiles = new LinkedList<File>(); } }