/*
* Copyright 2013 SFB 632.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package annis.utils;
import au.com.bytecode.opencsv.CSVReader;
import com.google.common.collect.FluentIterable;
import com.google.common.io.Files;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Static helpers for locating corpora stored as tab-separated table files
 * (for example {@code corpus.tab} or {@code corpus.annis}), either inside a
 * ZIP file or below a directory.
 *
 * @author Thomas Krause <krauseto@hu-berlin.de>
 */
public class ANNISFormatHelper
{

  private static final Logger log = LoggerFactory.
    getLogger(ANNISFormatHelper.class);

  /**
   * List all corpora of a ZIP file and their paths.
   *
   * Every {@code corpus.tab}/{@code corpus.annis} entry of the archive is
   * read to determine the name of its toplevel corpus.
   *
   * @param zip The opened ZIP file. It is not closed by this method.
   * @return A map from the toplevel corpus name to the ZIP entry of its
   * corpus table. The key is {@code null} for a table from which no name
   * could be extracted; tables yielding the same name overwrite each other.
   * @throws IOException If an entry of the ZIP file can not be opened.
   */
  public static Map<String, ZipEntry> corporaInZipfile(ZipFile zip) throws IOException
  {
    Map<String, ZipEntry> result = new HashMap<>();
    for (ZipEntry e : getANNISEntry(zip, "corpus"))
    {
      // close each entry stream right away instead of keeping all of them
      // open until the ZIP file itself is closed
      try (InputStream in = zip.getInputStream(e))
      {
        String name = extractToplevelCorpusNames(in);
        result.put(name, e);
      }
    }
    return result;
  }

  /**
   * List all corpora of a ZIP file and their paths.
   *
   * The file is opened only for the duration of the call. The returned
   * entries therefore describe the content of the archive, but reading their
   * data requires opening the ZIP file again.
   *
   * @param f The ZIP file.
   * @return A map from the toplevel corpus name to the ZIP entry of its
   * corpus table, see {@link #corporaInZipfile(java.util.zip.ZipFile)}.
   * @throws IOException If the file can not be opened or read as ZIP file.
   */
  public static Map<String, ZipEntry> corporaInZipfile(File f) throws IOException
  {
    Map<String, ZipEntry> result = new HashMap<>();
    try (ZipFile zip = new ZipFile(f))
    {
      result.putAll(corporaInZipfile(zip));
    }
    return result;
  }

  /**
   * List all corpora that are located somewhere below a directory.
   *
   * A corpus is recognized by a file named {@code corpus.annis} or
   * {@code corpus.tab} (compared case-insensitively).
   *
   * @param d The directory (or file) at which the traversal starts.
   * @return A map from the toplevel corpus name to the directory that
   * contains its corpus table. The key is {@code null} for a table from which
   * no name could be extracted. The map is never empty.
   * @throws IOException If a corpus table can not be opened or if no corpus
   * was found at all.
   */
  public static Map<String, File> corporaInDirectory(File d) throws IOException
  {
    Map<String, File> result = new HashMap<>();

    FluentIterable<File> it = Files.fileTreeTraverser().postOrderTraversal(d);
    for (File f : it)
    {
      String fileName = f.getName();
      boolean isCorpusTable = "corpus.annis".equalsIgnoreCase(fileName)
        || "corpus.tab".equalsIgnoreCase(fileName);
      // directories are skipped, like directory entries in getANNISEntry()
      if (isCorpusTable && f.isFile())
      {
        // the stream was never closed before, leaking one file handle per corpus
        try (InputStream in = new FileInputStream(f))
        {
          String toplevelName = extractToplevelCorpusNames(in);
          result.put(toplevelName, f.getParentFile());
        }
      }
    }

    if (result.isEmpty())
    {
      throw new IOException("no corpus found in " + d.getCanonicalPath());
    }
    return result;
  }

  /**
   * Extract the name of the toplevel corpus from the content of the
   * corpus.tab file.
   *
   * Only rows with at least six columns whose third column is
   * {@code CORPUS} (case-insensitive) are considered. The fifth and sixth
   * column are parsed as pre- and post-order; the name (second column) of the
   * row with the smallest pre-order and the largest post-order seen so far
   * is used.
   *
   * The given stream is read as UTF-8 and is <em>not</em> closed by this
   * method, closing it is the responsibility of the caller.
   *
   * @param corpusTabContent The content of the corpus table.
   * @return The name of the toplevel corpus or {@code null} if there is no
   * matching row or the content could not be read (the error is logged).
   * @throws NumberFormatException If the pre- or post-order column of a
   * corpus row is not an integer.
   */
  public static String extractToplevelCorpusNames(InputStream corpusTabContent)
  {
    String result = null;

    try
    {
      // not wrapped in try-with-resources on purpose: closing the reader
      // would also close the stream that is owned by the caller
      CSVReader csv = new CSVReader(new InputStreamReader(
        corpusTabContent, "UTF-8"), '\t');

      int maxPost = Integer.MIN_VALUE;
      int minPre = Integer.MAX_VALUE;

      String[] line;
      while ((line = csv.readNext()) != null)
      {
        if (line.length >= 6 && "CORPUS".equalsIgnoreCase(line[2]))
        {
          int pre = Integer.parseInt(line[4]);
          int post = Integer.parseInt(line[5]);

          // the toplevel corpus is the one whose pre/post interval
          // encloses the intervals of all other corpus rows
          if (pre <= minPre && post >= maxPost)
          {
            minPre = pre;
            maxPost = post;
            result = line[1];
          }
        }
      }
    }
    catch (IOException ex)
    {
      // also covers UnsupportedEncodingException, which is an IOException
      log.error("Could not read corpus table", ex);
    }

    return result;
  }

  /**
   * Find the entries of a ZIP file that are ANNIS table files for a given
   * table.
   *
   * An entry matches if it is not a directory and its path is either exactly
   * {@code <table>.<ending>} or ends with {@code /<table>.<ending>}. The
   * comparison ignores case and treats backslashes in the entry name as
   * path separators.
   *
   * @param file The ZIP file to search in.
   * @param table The table to search for.
   * @param fileEndings The possible endings of corpus tab files (if null or
   * empty, "tab" and "annis" are used as default).
   * @return All matching entries in the order in which the ZIP file lists
   * them; each entry is contained at most once. Never {@code null}.
   */
  public static List<ZipEntry> getANNISEntry(ZipFile file, String table,
    String... fileEndings)
  {
    List<ZipEntry> allMatchingEntries = new ArrayList<>();

    if (fileEndings == null || fileEndings.length == 0)
    {
      fileEndings = new String[] {"tab", "annis"};
    }

    final List<String> fullNames = new ArrayList<>(fileEndings.length);
    for (String e : fileEndings)
    {
      fullNames.add(table + "." + e);
    }

    Enumeration<? extends ZipEntry> entries = file.entries();
    while (entries.hasMoreElements())
    {
      ZipEntry entry = entries.nextElement();
      String name = entry.getName();
      if (entry.isDirectory() || name == null)
      {
        continue;
      }

      // some archives use backslashes as path separator
      name = name.replace('\\', '/');
      for (String n : fullNames)
      {
        // the suffix test must use the normalized name as well, otherwise
        // nested tables in backslash-separated archives are never found
        if (n.equalsIgnoreCase(name) || endsWithIgnoreCase(name, "/" + n))
        {
          allMatchingEntries.add(entry);
          // never add the same entry twice
          break;
        }
      }
    }

    return allMatchingEntries;
  }

  /**
   * Case-insensitive variant of {@link String#endsWith(java.lang.String)}.
   */
  private static boolean endsWithIgnoreCase(String text, String suffix)
  {
    int offset = text.length() - suffix.length();
    return offset >= 0
      && text.regionMatches(true, offset, suffix, 0, suffix.length());
  }
}