// ANNISFormatHelper.java example
//
// Explorer
// ANNIS-master
/*
 * Copyright 2013 SFB 632.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package annis.utils;

import au.com.bytecode.opencsv.CSVReader;
import com.google.common.collect.FluentIterable;
import com.google.common.io.Files;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Static helpers for locating the corpus description files of the ANNIS
 * import format ("corpus.tab" / "corpus.annis") inside ZIP archives or
 * directory trees, and for reading the toplevel corpus name out of them.
 *
 * @author Thomas Krause <krauseto@hu-berlin.de>
 */
public class ANNISFormatHelper
{

  private static final Logger log = LoggerFactory.
    getLogger(ANNISFormatHelper.class);


  /**
   * List all corpora of a ZIP file and their paths.
   *
   * Every "corpus.tab" / "corpus.annis" entry of the archive is parsed with
   * {@link #extractToplevelCorpusNames(java.io.InputStream)} and the
   * resulting name is mapped to the entry it was read from.
   *
   * @param zip An opened ZIP file. It is not closed by this method.
   * @return A map from the toplevel corpus name to the ZIP entry of its corpus
   * file. The key is {@code null} for an entry in which no corpus name could be
   * found.
   * @throws IOException If an entry of the archive can not be opened.
   */
  public static Map<String, ZipEntry> corporaInZipfile(ZipFile zip) throws IOException
  {
    Map<String, ZipEntry> result = new HashMap<>();

    for (ZipEntry e : getANNISEntry(zip, "corpus"))
    {
      // The entry stream is opened here, so it is also released here.
      try (InputStream in = zip.getInputStream(e))
      {
        String name = extractToplevelCorpusNames(in);
        result.put(name, e);
      }
    }

    return result;
  }

  /**
   * List all corpora of a ZIP file given as {@link File}.
   *
   * The archive is opened and closed again inside this method, thus the
   * returned entries only describe where the corpus files are located. To
   * read their content the archive has to be opened again by the caller.
   *
   * @param f The ZIP file.
   * @return A map from the toplevel corpus name to the ZIP entry of its corpus
   * file, see {@link #corporaInZipfile(java.util.zip.ZipFile)}.
   * @throws IOException If the file can not be read as ZIP archive.
   */
  public static Map<String, ZipEntry> corporaInZipfile(File f) throws IOException
  {
    try (ZipFile zip = new ZipFile(f))
    {
      return corporaInZipfile(zip);
    }
  }

  /**
   * List all corpora that are located somewhere below a directory.
   *
   * The directory tree is traversed completely and each file named
   * "corpus.annis" or "corpus.tab" (the case is ignored) is parsed for its
   * toplevel corpus name.
   *
   * @param d The directory to start the search from.
   * @return A map from the toplevel corpus name to the directory containing
   * the corpus file. The key is {@code null} for a corpus file in which no
   * corpus name could be found.
   * @throws IOException If a corpus file can not be opened or if no corpus
   * file was found at all.
   */
  public static Map<String, File> corporaInDirectory(File d) throws IOException
  {
    Map<String, File> result = new HashMap<>();

    FluentIterable<File> it = Files.fileTreeTraverser().postOrderTraversal(d);
    for (File f : it)
    {
      String fileName = f.getName();
      if ("corpus.annis".equalsIgnoreCase(fileName)
        || "corpus.tab".equalsIgnoreCase(fileName))
      {
        // Close each corpus file right after parsing, otherwise one file
        // handle per found corpus would stay open.
        try (InputStream in = new FileInputStream(f))
        {
          String toplevelName = extractToplevelCorpusNames(in);
          result.put(toplevelName, f.getParentFile());
        }
      }
    }

    if (result.isEmpty())
    {
      throw new IOException("no corpus found in " + d.getCanonicalPath());
    }

    return result;
  }

  /**
   * Extract the name of the toplevel corpus from the content of the
   * corpus.tab file.
   *
   * The content is read as tab separated UTF-8 text. Only rows with at least
   * six columns whose third column is "CORPUS" (the case is ignored) are
   * considered. From these rows the value of the second column is returned
   * for the row whose pre order (fifth column) is the smallest and whose
   * post order (sixth column) is the largest one seen.
   *
   * Rows with a non-numeric pre or post order are skipped with a warning.
   * Read errors are logged and end the parsing, the result found so far is
   * returned in this case.
   *
   * @param corpusTabContent The content of the corpus file. The stream is not
   * closed by this method, this is up to the caller.
   * @return The name of the toplevel corpus or {@code null} if none was found.
   */
  public static String extractToplevelCorpusNames(InputStream corpusTabContent)
  {
    String result = null;

    try
    {
      // The reader is intentionally not closed: it only wraps the stream of
      // the caller, who is responsible for closing it.
      CSVReader csv = new CSVReader(new InputStreamReader(
        corpusTabContent, "UTF-8"), '\t');
      String[] line;
      int maxPost = Integer.MIN_VALUE;
      int minPre = Integer.MAX_VALUE;

      while ((line = csv.readNext()) != null)
      {
        if (line.length >= 6 && "CORPUS".equalsIgnoreCase(line[2]))
        {
          int pre;
          int post;
          try
          {
            pre = Integer.parseInt(line[4]);
            post = Integer.parseInt(line[5]);
          }
          catch (NumberFormatException ex)
          {
            // A single malformed row should not abort the whole lookup.
            log.warn("Ignoring corpus row \"{}\" with non-numeric pre/post order",
              line[1], ex);
            continue;
          }

          // Keep the row whose pre/post interval spans all rows seen so far.
          if (pre <= minPre && post >= maxPost)
          {
            minPre = pre;
            maxPost = post;
            result = line[1];
          }
        }
      }
    }
    catch (IOException ex)
    {
      // also covers UnsupportedEncodingException, which is an IOException
      log.error("Could not read the corpus file content", ex);
    }
    return result;
  }

  /**
   * Find the entries of a ZIP file that are ANNIS files of a given table.
   *
   * An entry matches if it is not a directory and if its name is
   * "&lt;table&gt;.&lt;ending&gt;" either directly or as the last segment of
   * its path. The case is ignored and backslashes in entry names are treated
   * like forward slashes.
   *
   * @param file The ZIP file to search in.
   * @param table The table to search for.
   * @param fileEndings The possible endings of corpus tab files (if null or
   * empty "tab" and "annis" are used as default).
   * @return All matching entries in the order of the ZIP file, each entry is
   * contained at most once. The list is empty if nothing matches.
   */
  public static List<ZipEntry> getANNISEntry(ZipFile file, String table,
    String ... fileEndings)
  {
    List<ZipEntry> allMatchingEntries = new ArrayList<>();

    if (fileEndings == null || fileEndings.length == 0)
    {
      fileEndings = new String[] {"tab", "annis"};
    }

    final List<String> fullNames = new ArrayList<>(fileEndings.length);
    for (String e : fileEndings)
    {
      fullNames.add(table + "." + e);
    }

    Enumeration<? extends ZipEntry> entries = file.entries();
    while (entries.hasMoreElements())
    {
      ZipEntry entry = entries.nextElement();
      String name = entry.getName();
      if (entry.isDirectory() || name == null)
      {
        continue;
      }

      // Some archives use backslashes as separator; all comparisons below
      // have to use this normalized name.
      name = name.replace('\\', '/');
      for (String n : fullNames)
      {
        if (n.equalsIgnoreCase(name) || endsWithIgnoreCase(name, "/" + n))
        {
          allMatchingEntries.add(entry);
          // never add the same entry twice
          break;
        }
      }
    }
    return allMatchingEntries;
  }

  /**
   * Check if a string ends with a suffix without regarding the case.
   *
   * @param value The string to test.
   * @param suffix The expected ending.
   * @return True if the value ends with the suffix.
   */
  private static boolean endsWithIgnoreCase(String value, String suffix)
  {
    // regionMatches returns false for a negative offset, thus a value that is
    // shorter than the suffix needs no special handling
    return value.regionMatches(true, value.length() - suffix.length(),
      suffix, 0, suffix.length());
  }

}