/**
* Copyright 2015, Emory University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.emory.clir.clearnlp.lexicon.wikipedia;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import edu.emory.clir.clearnlp.util.FileUtils;
import edu.emory.clir.clearnlp.util.IOUtils;
import edu.emory.clir.clearnlp.util.Splitter;
/**
* @since 3.0.3
* @author Jinho D. Choi ({@code jinho.choi@emory.edu})
*/
public class WikiIndexMap implements Serializable
{
    private static final long serialVersionUID = -1930430749823956245L;

    /** Prefix of the line that starts a new page; the title follows in the second tab-separated field. */
    public static final String NEW_PAGE = "<New Page";
    /** Prefix of the line that starts a new paragraph within a page. */
    public static final String NEW_PARAGRAPH = "<New Paragraph";

    /** Maps a page title to the entry name and byte offset at which that page begins. */
    private Map<String,WikiIndex> title_pointer_map;

    public WikiIndexMap()
    {
        title_pointer_map = new HashMap<>();
    }

//  =================================== addIndices ===================================

    /**
     * Scans the file and registers, for every page header line (a line starting with
     * {@link #NEW_PAGE}), the byte offset at which that header begins.
     * A file that contains no page header adds nothing to this map.
     * @param filename the path to the file to be indexed; its base name is stored with each index.
     * @throws Exception if the file cannot be read or a page header has no title field.
     */
    public void addIndices(String filename) throws Exception
    {
        // try-with-resources guarantees that the file is closed even when parsing fails.
        try (RandomAccessFile in = new RandomAccessFile(filename, "r"))
        {
            long lineStart = in.getFilePointer();
            String line;

            while ((line = in.readLine()) != null)
            {
                // The offset must be taken BEFORE the header line is consumed, hence 'lineStart'.
                if (line.startsWith(NEW_PAGE))
                    addIndex(parseTitle(decodeUTF8(line)), filename, lineStart);

                lineStart = in.getFilePointer();
            }
        }
    }

    private void addIndex(String title, String filename, long beginPointer)
    {
        title_pointer_map.put(title, new WikiIndex(FileUtils.getBaseName(filename), beginPointer));
    }

    /**
     * {@link RandomAccessFile#readLine()} widens every byte to one char, which garbles multi-byte
     * characters. This recovers the original bytes and decodes them as UTF-8 so that indexed titles
     * match the titles decoded by {@link #getPage(InputStream, WikiIndex)}.
     * NOTE(review): assumes the indexed files are UTF-8 encoded - confirm against the dump format.
     */
    private static String decodeUTF8(String rawLine)
    {
        return new String(rawLine.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8);
    }

    /** @return the trimmed title stored in the second tab-separated field of a page header line. */
    private static String parseTitle(String headerLine) throws IOException
    {
        String[] fields = Splitter.splitTabs(headerLine);

        if (fields.length < 2)
            throw new IOException("Malformed page header (no title field): " + headerLine);

        return fields[1].trim();
    }

//  =================================== getPage ===================================

    /**
     * Retrieves the page with the specific title from the zip file.
     * @param zip the zip file whose entries are named by the base names of the indexed files.
     * @param title the title of the page to be retrieved.
     * @return the page with the specific title if it is indexed; otherwise, {@code null}.
     * @throws Exception if the indexed entry is missing from the zip file or cannot be read.
     */
    public WikiPage getPage(ZipFile zip, String title) throws Exception
    {
        WikiIndex index = title_pointer_map.get(title);
        if (index == null) return null;

        ZipEntry entry = zip.getEntry(index.getEntryName());

        if (entry == null)
            throw new IOException("Zip entry not found: " + index.getEntryName());

        try (InputStream in = zip.getInputStream(entry))
        {
            skipFully(in, index.getBeginPointer());
            return getPage(in, index);
        }
    }

    /**
     * Reads one page from the input stream, which must be positioned at the page header line.
     * The stream is read as UTF-8 and is closed before this method returns.
     * @param in the input stream positioned at the beginning of a page header line.
     * @param index the index of the page (currently not consulted).
     * @return the page read from the input stream.
     * @throws Exception if the stream is empty, the header is malformed, or reading fails.
     */
    public WikiPage getPage(InputStream in, WikiIndex index) throws Exception
    {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)))
        {
            String header = reader.readLine();

            if (header == null)
                throw new IOException("Unexpected end of stream: page header expected");

            WikiPage page = new WikiPage(parseTitle(header));
            addParagraphs(reader, page);
            return page;
        }
    }

    /**
     * Adds every paragraph up to the next page header (or the end of the stream) to the page.
     * Each non-marker line is trimmed and added as one sentence of the current paragraph.
     */
    private void addParagraphs(BufferedReader reader, WikiPage page) throws Exception
    {
        WikiParagraph paragraph = null;
        String line;

        while ((line = reader.readLine()) != null)
        {
            line = line.trim();

            if (line.startsWith(NEW_PAGE))
                break;

            // A marker always opens a new paragraph; a sentence appearing before any marker
            // opens an implicit one instead of dereferencing a null paragraph.
            boolean marker = line.startsWith(NEW_PARAGRAPH);

            if (marker || paragraph == null)
            {
                paragraph = new WikiParagraph();
                page.addParagraph(paragraph);
            }

            if (!marker)
                paragraph.addSentence(line);
        }
    }

    /**
     * Skips exactly {@code count} bytes. {@link InputStream#skip(long)} may skip fewer bytes than
     * requested, so a single call cannot be relied upon to reach the page offset.
     */
    private static void skipFully(InputStream in, long count) throws IOException
    {
        long remaining = count;

        while (remaining > 0)
        {
            long skipped = in.skip(remaining);

            if (skipped > 0)
                remaining -= skipped;
            else if (in.read() >= 0)    // skip() made no progress: consume one byte to detect EOF
                remaining--;
            else
                throw new IOException("Unexpected end of stream: " + remaining + " more byte(s) to skip");
        }
    }

    /** @return the index of the page with the specific title if exists; otherwise, {@code null}. */
    public WikiIndex getIndex(String title)
    {
        return title_pointer_map.get(title);
    }

    /** @return the number of indexed titles. */
    public int size()
    {
        return title_pointer_map.size();
    }

    /**
     * Indexes every file returned by {@code FileUtils.getFileList(inputPath, "out", false)} and
     * writes the resulting map as a serialized object to an XZ output stream.
     * @param args {@code args[0]} = the input path, {@code args[1]} = the output file.
     */
    public static void main(String[] args)
    {
        if (args.length < 2)
        {
            System.err.println("Usage: WikiIndexMap <input path> <output file>");
            return;
        }

        WikiIndexMap map = new WikiIndexMap();
        final String inputPath  = args[0];
        final String outputFile = args[1];

        try
        {
            for (String filename : FileUtils.getFileList(inputPath, "out", false))
            {
                System.out.println(filename);
                map.addIndices(filename);
            }

            // Closed automatically so that the compressed stream is finished even if writing fails.
            try (ObjectOutputStream out = new ObjectOutputStream(IOUtils.createXZBufferedOutputStream(outputFile)))
            {
                out.writeObject(map);
            }
        }
        catch (Exception e) {e.printStackTrace();}
    }
}