/**
* Copyright 2015, Emory University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.emory.clir.clearnlp.lexicon.wikipedia;
import java.io.BufferedReader;
import java.io.PrintStream;
import java.util.Enumeration;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import edu.emory.clir.clearnlp.component.utils.NLPUtils;
import edu.emory.clir.clearnlp.tokenization.AbstractTokenizer;
import edu.emory.clir.clearnlp.util.IOUtils;
import edu.emory.clir.clearnlp.util.Joiner;
import edu.emory.clir.clearnlp.util.lang.TLanguage;
/**
 * Flattens a zip archive of text entries into a single tokenized text file.
 * <p>
 * Every entry of the archive is read line by line. Lines starting with
 * {@code WikiIndexMap.NEW_PAGE} or {@code WikiIndexMap.NEW_PARAGRAPH} are dropped;
 * every other line is tokenized with the English tokenizer and written to the
 * output file as its tokens joined by a single space.
 * <p>
 * All of the work is performed by the constructor; see {@link #main(String[])}
 * for command-line usage.
 *
 * @since 3.0.3
 * @author Jinho D. Choi ({@code jinho.choi@emory.edu})
 */
public class WikiPrintAll
{
	/** Destination of the tokenized lines; opened and closed by the constructor. */
	PrintStream f_out;
	/** Tokenizer applied to every line that is kept. */
	AbstractTokenizer tokenizer;

	/**
	 * Processes every entry of {@code zipFile} and writes the tokenized lines to {@code outputFile}.
	 * The name of each entry is echoed to {@code System.out} before it is processed.
	 * The archive, the per-entry readers, and the output stream are all closed before
	 * this constructor returns, whether it completes normally or throws.
	 *
	 * @param zipFile    path to the zip archive to read.
	 * @param outputFile path to the file receiving the tokenized text.
	 * @throws Exception if the archive or output file cannot be opened, or reading/tokenizing fails.
	 */
	public WikiPrintAll(String zipFile, String outputFile) throws Exception
	{
		f_out = IOUtils.createBufferedPrintStream(outputFile);

		try
		{
			tokenizer = NLPUtils.getTokenizer(TLanguage.ENGLISH);

			// try-with-resources releases the archive handle even if an entry fails to process.
			try (ZipFile file = new ZipFile(zipFile))
			{
				Enumeration<? extends ZipEntry> entries = file.entries();

				while (entries.hasMoreElements())
				{
					ZipEntry entry = entries.nextElement();
					System.out.println(entry.getName());

					// Close each entry's reader as soon as it is consumed instead of
					// accumulating open streams until the archive itself is closed.
					try (BufferedReader reader = IOUtils.createBufferedReader(file.getInputStream(entry)))
					{
						print(reader);
					}
				}
			}
		}
		finally
		{
			// Always flush and close the output, including on failure, so that
			// whatever was written before an error is not lost in the buffer.
			f_out.close();
		}
	}

//	=================================== print ===================================

	/**
	 * Tokenizes every content line supplied by {@code reader} and writes it to the output stream.
	 * Lines starting with {@code WikiIndexMap.NEW_PAGE} or {@code WikiIndexMap.NEW_PARAGRAPH}
	 * are skipped. The reader is not closed by this method; the caller owns it.
	 *
	 * @param reader source of the lines to tokenize.
	 * @throws Exception if reading a line or tokenizing it fails.
	 */
	public void print(BufferedReader reader) throws Exception
	{
		String line;

		while ((line = reader.readLine()) != null)
		{
			// Page/paragraph markers are structural metadata, not text to be emitted.
			if (line.startsWith(WikiIndexMap.NEW_PAGE) || line.startsWith(WikiIndexMap.NEW_PARAGRAPH))
				continue;

			f_out.println(Joiner.join(tokenizer.tokenize(line), " "));
		}
	}

	/**
	 * Command-line entry point.
	 *
	 * @param args {@code args[0]} is the input zip archive, {@code args[1]} is the output file.
	 * @throws IllegalArgumentException if fewer than two arguments are given.
	 * @throws Exception if processing fails.
	 */
	static public void main(String[] args) throws Exception
	{
		// Fail with a usage hint rather than an opaque ArrayIndexOutOfBoundsException.
		if (args.length < 2)
			throw new IllegalArgumentException("Usage: WikiPrintAll <zipFile> <outputFile>");

		final String inputPath  = args[0];
		final String outputFile = args[1];
		new WikiPrintAll(inputPath, outputFile);
	}
}