/*
 * Copyright 2011
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.io.web1t;

import static org.junit.Assert.assertEquals;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.List;

import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
import de.tudarmstadt.ukp.dkpro.core.io.web1t.util.Web1TFileSplitter;

/**
 * Exercises {@link Web1TFileSplitter} on a small, generated input file.
 *
 * <p>The fixture writes twelve tab-separated "word / count" lines to a temporary input file,
 * builds a frequency distribution over the lower-cased one- or two-character prefixes of those
 * words, and hands both to the splitter together with a temporary output directory.
 */
public class Web1TSplitterTest
{
    /** Character encoding used for every file the test reads or writes. */
    private static final String ENCODING = "UTF-8";

    /** Prefix frequencies of the words written to {@link #input}. */
    FrequencyDistribution<String> fdist;

    /** Generated input file; created in {@link #setUp()} and deleted in {@link #tearDown()}. */
    File input;

    /** Directory that receives the split files; removed again in {@link #tearDown()}. */
    File output;

    /**
     * Splits the generated input and checks the next unused file number, the total number of
     * lines across all split files, and that no split files are reported after clean-up.
     *
     * @throws IOException if splitting or reading the split files fails
     */
    @Test
    public void testSplitter()
        throws IOException
    {
        Web1TFileSplitter splitter = new Web1TFileSplitter(input, output, ENCODING, fdist, 0.1, 0);
        splitter.split();

        List<File> splits = splitter.getFiles();

        assertEquals(4, splitter.getNextUnusedFileNumber());
        // Disabled in the original test (reason not recorded):
        // assertEquals(4, splits.size());
        assertEquals(12, countWordsInSplitFiles(splits));

        // NOTE(review): cleanUp() presumably removes the split files so that getFiles() comes
        // back empty; the assertion below depends on this call being active - confirm.
        splitter.cleanUp();
        splits = splitter.getFiles();
        assertEquals(0, splits.size());
    }

    /**
     * Counts the lines contained in the given files. Each line of a split file holds one word
     * entry, so the line count equals the number of words.
     *
     * @param splits the files to read
     * @return the total number of lines over all files
     * @throws IOException if a file cannot be opened or read
     */
    private int countWordsInSplitFiles(List<File> splits)
        throws IOException
    {
        int words = 0;
        for (File file : splits) {
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), ENCODING));
            try {
                while (reader.readLine() != null) {
                    words++;
                }
            }
            finally {
                // Release the file handle even when reading fails part-way through.
                reader.close();
            }
        }
        return words;
    }

    /**
     * Creates the input file, its prefix frequency distribution and the output directory.
     *
     * @throws IOException if the input file cannot be written or the output directory cannot be
     *         created
     */
    @Before
    public void setUp()
        throws IOException
    {
        fdist = createTestInputFile();

        output = new File("src/test/resources/tmp." + this.getClass().getName());
        // A directory left over from an aborted run is fine; anything else must be created.
        if (!output.isDirectory() && !output.mkdirs()) {
            throw new IOException("Unable to create output directory: " + output);
        }
    }

    /**
     * Writes the test words to {@code input.txt} (one {@code word<TAB>1} entry per line), stores
     * the file in {@link #input}, and returns the frequency distribution of the lower-cased
     * prefixes (the first two characters, or the single character of one-letter words).
     *
     * @return the prefix frequency distribution of the written words
     * @throws IOException if the input file cannot be written
     */
    private FrequencyDistribution<String> createTestInputFile()
        throws IOException
    {
        input = new File("input.txt");

        String[] words = new String[] { "Can", "you", "can", "a", "can", "as", "a", "canner",
                "can", "can", "a", "can" };

        FrequencyDistribution<String> prefixCounts = new FrequencyDistribution<String>();

        BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(input), ENCODING));
        try {
            for (String word : words) {
                writer.write(word + "\t" + "1" + "\n");

                // Use at most the first two characters as the key.
                int prefixLength = Math.min(2, word.length());
                String prefix = word.substring(0, prefixLength).toLowerCase();
                prefixCounts.addSample(prefix, 1);
            }
        }
        finally {
            // Close (and thereby flush) the writer even when a write fails.
            writer.close();
        }

        return prefixCounts;
    }

    /**
     * Deletes the input file, every file directly inside the output directory, and finally the
     * output directory itself. Tolerates a partially initialised fixture so that a failure in
     * {@link #setUp()} is not masked by a {@link NullPointerException} here.
     */
    @After
    public void tearDown()
    {
        if (input != null) {
            input.delete();
        }

        if (output == null) {
            return;
        }

        // listFiles() yields null when the path is not a directory or cannot be read.
        File[] files = output.listFiles();
        if (files != null) {
            for (File file : files) {
                file.delete();
            }
        }
        output.delete();
    }
}