CommonHelper.java example

Explorer
ANNIS-master
/*
 * Copyright 2011 SFB 632.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package annis;

import annis.model.AnnisConstants;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.UUID;
import org.apache.commons.lang3.StringUtils;
import org.corpus_tools.salt.SALT_TYPE;
import org.corpus_tools.salt.common.SCorpus;
import org.corpus_tools.salt.common.SCorpusGraph;
import org.corpus_tools.salt.common.SDocument;
import org.corpus_tools.salt.common.SDocumentGraph;
import org.corpus_tools.salt.common.SOrderRelation;
import org.corpus_tools.salt.common.STextualDS;
import org.corpus_tools.salt.common.STextualRelation;
import org.corpus_tools.salt.common.SToken;
import org.corpus_tools.salt.common.SaltProject;
import org.corpus_tools.salt.core.GraphTraverseHandler;
import org.corpus_tools.salt.core.SAnnotation;
import org.corpus_tools.salt.core.SFeature;
import org.corpus_tools.salt.core.SGraph.GRAPH_TRAVERSE_TYPE;
import org.corpus_tools.salt.core.SLayer;
import org.corpus_tools.salt.core.SNode;
import org.corpus_tools.salt.core.SRelation;
import org.corpus_tools.salt.graph.Label;
import org.corpus_tools.salt.util.DataSourceSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Utility class with static helper methods for non-GUI operations.
 *
 * @author Thomas Krause <krauseto@hu-berlin.de>
 * @author Benjamin Weißenfels <b.pixeldrama@gmail.com>
 */
public class CommonHelper
{

  // Class-wide SLF4J logger; getCorpusPath(URI) uses it to report decoding errors.
  private final static Logger log = LoggerFactory.getLogger(CommonHelper.class);

  /**
   * Checks whether a string contains characters of a right-to-left script
   * (Hebrew or Arabic).
   *
   * <p>
   * A character is treated as right-to-left if its numeric value lies in one
   * of these inclusive ranges:
   * <code>[1425, 1785] | [64286, 65019] | [65136, 65276]</code>
   * </p>
   *
   * @param str The string to be checked, may be null.
   * @return true if at least one character lies in one of the ranges, false
   * otherwise (also for a null or empty string).
   */
  public static boolean containsRTLText(String str)
  {
    if (str == null)
    {
      return false;
    }

    for (char c : str.toCharArray())
    {
      // Hebrew (basic and extended) up to Arabic (basic and extended)
      boolean hebrewOrArabic = c >= 1425 && c <= 1785;
      // alphabetic presentation forms (Hebrew) up to Arabic presentation forms A
      boolean presentationFormsA = c >= 64286 && c <= 65019;
      // Arabic presentation forms B
      boolean presentationFormsB = c >= 65136 && c <= 65276;

      if (hebrewOrArabic || presentationFormsA || presentationFormsB)
      {
        return true;
      }
    }

    return false;
  }

  /**
   * Calculates a {@link SOrderRelation} node chain of a {@link SDocumentGraph}.
   *
   * <p>
   * If no segmentation name is set, the tokens of the graph are returned in
   * the order delivered by {@link SDocumentGraph#getSortedTokenByText()}.
   * </p>
   *
   * <p>
   * Otherwise every node whose
   * {@link AnnisConstants#FEAT_FIRST_NODE_SEGMENTATION_CHAIN} feature matches
   * the segmentation name (ignoring case) starts a chain. Beginning at each
   * start node, the first outgoing {@link SOrderRelation} is followed
   * repeatedly. Every node is added at most once: a chain stops as soon as it
   * reaches a node that is already part of the result, so cyclic order
   * relations terminate and do not produce duplicates.
   * </p>
   *
   * @param segName The segmentation name, for which the chain is computed. May
   * be null.
   * @param graph The salt document graph, which is traversed for the
   * segmentation. Must not be null.
   *
   * @return Returns a List of {@link SNode}, which is sorted by the
   * {@link SOrderRelation}. Never null.
   */
  public static List<SNode> getSortedSegmentationNodes(String segName,
    SDocumentGraph graph)
  {
    List<SNode> token = new ArrayList<SNode>();

    if (segName == null)
    {
      // if no segmentation is given just return the sorted token list
      List<SToken> sortedToken = graph.getSortedTokenByText();
      if (sortedToken != null)
      {
        token.addAll(sortedToken);
      }
    }
    else
    {
      // get the very first node(s) of the order relation chain(s)
      Set<SNode> startNodes = new LinkedHashSet<SNode>();

      for (SNode n : graph.getNodes())
      {
        SFeature feat
          = n.getFeature(AnnisConstants.ANNIS_NS,
            AnnisConstants.FEAT_FIRST_NODE_SEGMENTATION_CHAIN);
        if (feat != null && segName.equalsIgnoreCase(feat.getValue_STEXT()))
        {
          startNodes.add(n);
        }
      }

      // IDs of all nodes that are already part of the result
      Set<String> alreadyAdded = new HashSet<String>();

      // add all nodes on the order relation chain beginning from the start node
      for (SNode s : startNodes)
      {
        SNode current = s;
        // Set.add() returns false for an ID that was seen before. Registering
        // the start node as well means a cycle leading back to it is detected
        // before the node is added a second time.
        while (current != null && alreadyAdded.add(current.getId()))
        {
          token.add(current);
          current = getOrderSuccessor(current, graph);
        }
      }
    }

    return token;
  }

  /**
   * Returns the target of the first outgoing {@link SOrderRelation} of a node,
   * or null if the node has no outgoing order relation.
   */
  private static SNode getOrderSuccessor(SNode node, SDocumentGraph graph)
  {
    List<SRelation<SNode,SNode>> out = graph.getOutRelations(node.getId());
    if (out != null)
    {
      for (SRelation<? extends SNode,? extends SNode> e : out)
      {
        if (e instanceof SOrderRelation)
        {
          return ((SOrderRelation) e).getTarget();
        }
      }
    }
    return null;
  }

  /**
   * Collects the qualified names of all annotations that occur on the tokens
   * of a document graph.
   *
   * @param graph The document graph whose tokens are inspected, may be null.
   * @return A sorted set of qualified annotation names, empty if the graph is
   * null.
   */
  public static Set<String> getTokenAnnotationLevelSet(SDocumentGraph graph)
  {
    Set<String> qualifiedNames = new TreeSet<String>();
    if (graph == null)
    {
      return qualifiedNames;
    }

    for (SToken token : graph.getTokens())
    {
      for (SAnnotation annotation : token.getAnnotations())
      {
        qualifiedNames.add(annotation.getQName());
      }
    }

    return qualifiedNames;
  }

  /**
   * Collects the qualified names of all token annotations of every document
   * in a salt project.
   *
   * @param p The salt project, may be null.
   * @return A sorted set of qualified annotation names. Empty if the project
   * is null or has no corpus graphs.
   */
  public static Set<String> getTokenAnnotationLevelSet(SaltProject p)
  {
    Set<String> result = new TreeSet<String>();

    // same null handling as the graph overload and getToplevelCorpusNames()
    if (p != null && p.getCorpusGraphs() != null)
    {
      for (SCorpusGraph corpusGraph : p.getCorpusGraphs())
      {
        for (SDocument doc : corpusGraph.getDocuments())
        {
          // a null document graph is handled by the graph overload, which
          // returns an empty set in that case
          SDocumentGraph g = doc.getDocumentGraph();
          result.addAll(getTokenAnnotationLevelSet(g));
        }
      }
    }

    return result;
  }

  /**
   * Gets the spanned/covered text for a token. This looks for the first
   * {@link STextualRelation} among the outgoing relations of the
   * {@link SToken} in its {@link SDocumentGraph} and returns the substring of
   * the target {@link STextualDS} text between the start and end position of
   * that relation.
   *
   * @param tok The {@link SToken} which is overlapping the text sequence.
   * @return The covered text. An empty {@link String} object if the token is
   * null, is not attached to a graph or has no {@link STextualRelation}.
   */
  public static String getSpannedText(SToken tok)
  {
    if (tok == null)
    {
      return "";
    }

    SDocumentGraph graph = tok.getGraph();
    if (graph == null)
    {
      // a token without a graph has no relations at all
      return "";
    }

    List<SRelation<SNode,SNode>> edges = graph.getOutRelations(tok.getId());
    // the relation list is null-checked like in getSortedSegmentationNodes()
    if (edges != null)
    {
      for (SRelation<? extends SNode,? extends SNode> e : edges)
      {
        if (e instanceof STextualRelation)
        {
          STextualRelation textRel = (STextualRelation) e;
          return textRel.getTarget().getText().substring(textRel.getStart(),
            textRel.getEnd());
        }
      }
    }
    return "";
  }

  /**
   * Checks whether a {@link SNode} is a member of a specific {@link SLayer}.
   *
   * @param layerName The layer name to look for.
   * @param node The node to check.
   * @return true if any label of any layer of the node has a value that is
   * equal to the layer name; false otherwise, or if one of the arguments is
   * null.
   */
  public static boolean checkSLayer(String layerName, SNode node)
  {
    // robustness: nothing can match without a name or a node
    if (layerName == null || node == null)
    {
      return false;
    }

    Set<SLayer> layers = node.getLayers();
    if (layers == null)
    {
      return false;
    }

    for (SLayer layer : layers)
    {
      Collection<Label> layerLabels = layer.getLabels();
      if (layerLabels == null)
      {
        continue;
      }

      for (Label candidate : layerLabels)
      {
        if (layerName.equals(candidate.getValue()))
        {
          return true;
        }
      }
    }

    return false;
  }

  /**
   * Computes the corpus path of a document.
   *
   * <p>
   * The list starts with the name of the document. It is followed by the name
   * of every node that is reached by a bottom-up depth-first traversal of the
   * corpus graph which starts at the corpus of the document (presumably the
   * parent corpora up to the root corpus; confirm against the traversal
   * semantics of Salt).
   * </p>
   *
   * @param corpusGraph The corpus graph that contains the document.
   * @param doc The document for which the path is computed.
   * @return The names on the path, beginning with the document name.
   */
  public static List<String> getCorpusPath(SCorpusGraph corpusGraph,
    SDocument doc)
  {
    final List<String> path = new LinkedList<String>();
    path.add(doc.getName());

    // records the name of each node the traversal reaches, never prunes
    GraphTraverseHandler nameCollector = new GraphTraverseHandler()
    {
      @Override
      public void nodeReached(GRAPH_TRAVERSE_TYPE traversalType,
        String traversalId, SNode currNode, SRelation edge, SNode fromNode,
        long order)
      {
        path.add(currNode.getName());
      }

      @Override
      public void nodeLeft(GRAPH_TRAVERSE_TYPE traversalType,
        String traversalId,
        SNode currNode, SRelation edge, SNode fromNode, long order)
      {
        // nothing to do when leaving a node
      }

      @Override
      public boolean checkConstraint(GRAPH_TRAVERSE_TYPE traversalType,
        String traversalId, SRelation edge, SNode currNode, long order)
      {
        return true;
      }
    };

    List<SNode> traversalStart = new ArrayList<>();
    traversalStart.add(corpusGraph.getCorpus(doc));
    corpusGraph.traverse(traversalStart,
      GRAPH_TRAVERSE_TYPE.BOTTOM_UP_DEPTH_FIRST, "getRootCorpora",
      nameCollector);

    return path;
  }

  /**
   * Splits the path of an URI into its decoded segments.
   *
   * <p>
   * Leading and trailing slashes, spaces and tabs of the raw path are
   * stripped. The remainder is split at "/" while it is still encoded, so an
   * encoded "/" inside a corpus name does not split the name. Each segment is
   * then URL-decoded as UTF-8 on its own.
   * </p>
   *
   * @param uri The URI whose path is split. Must not be null.
   * @return The decoded path segments. An empty list if the URI has no path
   * component, which is the case for opaque URIs.
   */
  public static List<String> getCorpusPath(URI uri)
  {
    String rawPath = StringUtils.strip(uri.getRawPath(), "/ \t");
    if (rawPath == null)
    {
      // getRawPath() is null when the path is undefined and strip() passes
      // the null through, so there is nothing to split
      return new ArrayList<>();
    }

    // split on raw path (so "/" in corpus names are still encoded)
    String[] path = rawPath.split("/");

    // decode every single part by itself
    List<String> result = new ArrayList<>(path.length);
    for (String segment : path)
    {
      try
      {
        result.add(URLDecoder.decode(segment, "UTF-8"));
      }
      catch (UnsupportedEncodingException ex)
      {
        log.error(null, ex);
        // fallback: keep the segment in its encoded form
        result.add(segment);
      }
    }

    return result;
  }

  /**
   * Finds the {@link STextualDS} for a given node. The node must dominate a
   * token of this text.
   *
   * @param node The node for which the text is searched, may be null.
   * @param graph The document graph which is asked for the data source
   * sequences that the node overlaps.
   * @return The first overlapped data source that is a {@link STextualDS}, or
   * null if the node is null or no such data source was found.
   */
  public static STextualDS getTextualDSForNode(SNode node, SDocumentGraph graph)
  {
    if (node == null)
    {
      return null;
    }

    List<DataSourceSequence> overlapped
      = graph.getOverlappedDataSourceSequence(node,
        SALT_TYPE.STEXT_OVERLAPPING_RELATION);
    if (overlapped == null)
    {
      return null;
    }

    for (DataSourceSequence candidate : overlapped)
    {
      if (candidate.getDataSource() instanceof STextualDS)
      {
        return (STextualDS) candidate.getDataSource();
      }
    }

    return null;
  }

  /**
   * Returns a file name that is safe to use and does not have any invalid
   * characters.
   *
   * <p>
   * Every character that is not an ASCII letter, an ASCII digit or a hyphen is
   * replaced by an underscore.
   * </p>
   *
   * @param orig The original name, may be null.
   * @return The sanitized name, or a random UUID string if the original name
   * is null.
   */
  public static String getSafeFileName(String orig)
  {
    if (orig == null)
    {
      // nothing to sanitize, fall back to a generated name
      return UUID.randomUUID().toString();
    }

    return orig.replaceAll("[^0-9A-Za-z-]", "_");
  }

  /**
   * Gets the names of all root nodes of the corpus graphs of a salt project.
   *
   * @param p The salt project, may be null.
   * @return The names of the root nodes. An empty set if the project is null
   * or has no corpus graphs.
   */
  public static Set<String> getToplevelCorpusNames(SaltProject p)
  {
    Set<String> names = new HashSet<>();

    if (p == null || p.getCorpusGraphs() == null)
    {
      return names;
    }

    for (SCorpusGraph corpusGraph : p.getCorpusGraphs())
    {
      if (corpusGraph.getRoots() == null)
      {
        // graph without roots contributes nothing
        continue;
      }

      for (SNode root : corpusGraph.getRoots())
      {
        names.add(root.getName());
      }
    }

    return names;
  }

  /**
   * Takes a map of salt node IDs to a value and returns a new map that uses
   * the SNodes as keys instead of the IDs.
   *
   * <p>
   * The iteration order of the given map is kept. Entries whose ID is not
   * found in the graph are left out.
   * </p>
   *
   * @param <V> The type of the values.
   * @param map The map from node ID to value, may be null.
   * @param graph The graph in which the IDs are looked up, may be null.
   * @return A new map from node to value. Empty if the map or the graph is
   * null.
   */
  public static <V> Map<SNode, V> createSNodeMapFromIDs(Map<String, V> map,
    SDocumentGraph graph)
  {
    Map<SNode, V> byNode = new LinkedHashMap<>();

    if (map == null || graph == null)
    {
      return byNode;
    }

    for (Map.Entry<String, V> entry : map.entrySet())
    {
      SNode resolved = graph.getNode(entry.getKey());
      if (resolved != null)
      {
        byNode.put(resolved, entry.getValue());
      }
    }

    return byNode;
  }
  
}