/*
* Copyright 2011 SFB 632.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package annis;
import annis.model.AnnisConstants;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.UUID;
import org.apache.commons.lang3.StringUtils;
import org.corpus_tools.salt.SALT_TYPE;
import org.corpus_tools.salt.common.SCorpus;
import org.corpus_tools.salt.common.SCorpusGraph;
import org.corpus_tools.salt.common.SDocument;
import org.corpus_tools.salt.common.SDocumentGraph;
import org.corpus_tools.salt.common.SOrderRelation;
import org.corpus_tools.salt.common.STextualDS;
import org.corpus_tools.salt.common.STextualRelation;
import org.corpus_tools.salt.common.SToken;
import org.corpus_tools.salt.common.SaltProject;
import org.corpus_tools.salt.core.GraphTraverseHandler;
import org.corpus_tools.salt.core.SAnnotation;
import org.corpus_tools.salt.core.SFeature;
import org.corpus_tools.salt.core.SGraph.GRAPH_TRAVERSE_TYPE;
import org.corpus_tools.salt.core.SLayer;
import org.corpus_tools.salt.core.SNode;
import org.corpus_tools.salt.core.SRelation;
import org.corpus_tools.salt.graph.Label;
import org.corpus_tools.salt.util.DataSourceSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Utilities class for non-gui operations.
*
* @author Thomas Krause <krauseto@hu-berlin.de>
* @author Benjamin Weißenfels <b.pixeldrama@gmail.com>
*/
public class CommonHelper
{
private final static Logger log = LoggerFactory.getLogger(CommonHelper.class);
/**
* Detects arabic characters in a string.
*
* <p>
* Every character is checked, if its bit representation lies between:
* <code>[1425, 1785] | [64286, 65019] | [65136, 65276]</code>
*
* </p>
*
* @param str The string to be checked.
* @return returns true, if arabic characters are detected.
*/
public static boolean containsRTLText(String str)
{
if (str != null)
{
for (int i = 0; i < str.length(); i++)
{
char cc = str.charAt(i);
// hebrew extended and basic, arabic basic and extendend
if (cc >= 1425 && cc <= 1785)
{
return true;
}
// alphabetic presentations forms (hebrwew) to arabic presentation forms A
else if (cc >= 64286 && cc <= 65019)
{
return true;
}
// arabic presentation forms B
else if (cc >= 65136 && cc <= 65276)
{
return true;
}
}
}
return false;
}
/**
* Calculates a {@link SOrderRelation} node chain of a {@link SDocumentGraph}.
*
* <p>
* If no segmentation name is set, a list of sorted {@link SToken} will be
* returned.<p>
*
* @param segName The segmentation name, for which the chain is computed.
* @param graph The salt document graph, which is traversed for the
* segmentation.
*
* @return Returns a List of {@link SNode}, which is sorted by the
* {@link SOrderRelation}.
*/
public static List<SNode> getSortedSegmentationNodes(String segName,
SDocumentGraph graph)
{
List<SNode> token = new ArrayList<SNode>();
if (segName == null)
{
// if no segmentation is given just return the sorted token list
List<SToken> unsortedToken = graph.getSortedTokenByText();
if(unsortedToken != null)
{
token.addAll(unsortedToken);
}
}
else
{
// get the very first node of the order relation chain
Set<SNode> startNodes = new LinkedHashSet<SNode>();
for (SNode n : graph.getNodes())
{
SFeature feat
= n.getFeature(AnnisConstants.ANNIS_NS,
AnnisConstants.FEAT_FIRST_NODE_SEGMENTATION_CHAIN);
if (feat != null && segName.equalsIgnoreCase(feat.getValue_STEXT()))
{
startNodes.add(n);
}
}
Set<String> alreadyAdded = new HashSet<String>();
// add all nodes on the order relation chain beginning from the start node
for (SNode s : startNodes)
{
SNode current = s;
while (current != null)
{
token.add(current);
List<SRelation<SNode,SNode>> out = graph.getOutRelations(current.getId());
current = null;
if (out != null)
{
for (SRelation<? extends SNode,? extends SNode> e : out)
{
if (e instanceof SOrderRelation)
{
current = ((SOrderRelation) e).getTarget();
if (alreadyAdded.contains(current.getId()))
{
// abort if cycle detected
current = null;
}
else
{
alreadyAdded.add(current.getId());
}
break;
}
}
}
}
}
}
return token;
}
public static Set<String> getTokenAnnotationLevelSet(SDocumentGraph graph)
{
Set<String> result = new TreeSet<String>();
if (graph != null)
{
for (SToken n : graph.getTokens())
{
for (SAnnotation anno : n.getAnnotations())
{
result.add(anno.getQName());
}
}
}
return result;
}
public static Set<String> getTokenAnnotationLevelSet(SaltProject p)
{
Set<String> result = new TreeSet<String>();
for (SCorpusGraph corpusGraphs : p.getCorpusGraphs())
{
for (SDocument doc : corpusGraphs.getDocuments())
{
SDocumentGraph g = doc.getDocumentGraph();
result.addAll(getTokenAnnotationLevelSet(g));
}
}
return result;
}
/**
* Gets the spannend/covered text for a token. This will get all
* {@link STextualRelation} edges for a {@link SToken} from the
* {@link SDocumentGraph} and calculates the appropiate substring from the
* {@link STextualDS}.
*
* @param tok The {@link SToken} which is overlapping the text sequence.
* @return An empty {@link String} object, if there is no
* {@link STextualRelation}
*/
public static String getSpannedText(SToken tok)
{
SDocumentGraph graph = tok.getGraph();
List<SRelation<SNode,SNode>> edges = graph.getOutRelations(tok.getId());
for (SRelation<? extends SNode,? extends SNode> e : edges)
{
if (e instanceof STextualRelation)
{
STextualRelation textRel = (STextualRelation) e;
return textRel.getTarget().getText().substring(textRel.getStart(),
textRel.getEnd());
}
}
return "";
}
/**
* Checks a {@link SNode} if it is member of a specific {@link SLayer}.
*
* @param layerName Specifies the layername to check.
* @param node Specifies the node to check.
* @return true - it is true when the name of layername corresponds to the
* name of any label of the SNode.
*/
public static boolean checkSLayer(String layerName, SNode node)
{
//robustness
if (layerName == null || node == null)
{
return false;
}
Set<SLayer> sLayers = node.getLayers();
if (sLayers != null)
{
for (SLayer l : sLayers)
{
Collection<Label> labels = l.getLabels();
if (labels != null)
{
for (Label label : labels)
{
if (layerName.equals(label.getValue()))
{
return true;
}
}
}
}
}
return false;
}
public static List<String> getCorpusPath(SCorpusGraph corpusGraph,
SDocument doc)
{
final List<String> result = new LinkedList<String>();
result.add(doc.getName());
SCorpus c = corpusGraph.getCorpus(doc);
List<SNode> cAsList = new ArrayList<>();
cAsList.add(c);
corpusGraph.traverse(cAsList, GRAPH_TRAVERSE_TYPE.BOTTOM_UP_DEPTH_FIRST,
"getRootCorpora",
new GraphTraverseHandler()
{
@Override
public void nodeReached(GRAPH_TRAVERSE_TYPE traversalType,
String traversalId, SNode currNode, SRelation edge, SNode fromNode,
long order)
{
result.add(currNode.getName());
}
@Override
public void nodeLeft(GRAPH_TRAVERSE_TYPE traversalType,
String traversalId,
SNode currNode, SRelation edge, SNode fromNode, long order)
{
}
@Override
public boolean checkConstraint(GRAPH_TRAVERSE_TYPE traversalType,
String traversalId, SRelation edge, SNode currNode, long order)
{
return true;
}
});
return result;
}
public static List<String> getCorpusPath(URI uri)
{
String rawPath = StringUtils.strip(uri.getRawPath(), "/ \t");
// split on raw path (so "/" in corpus names are still encoded)
String[] path = rawPath.split("/");
// decode every single part by itself
ArrayList<String> result = new ArrayList<>(path.length);
for (int i = 0; i < path.length; i++)
{
try
{
result.add(URLDecoder.decode(path[i], "UTF-8"));
}
catch (UnsupportedEncodingException ex)
{
log.error(null, ex);
// fallback
result.add(path[i]);
}
}
return result;
}
/**
* Finds the {@link STextualDS} for a given node. The node must dominate a
* token of this text.
*
* @param node
* @return
*/
public static STextualDS getTextualDSForNode(SNode node, SDocumentGraph graph)
{
if (node != null)
{
List<DataSourceSequence> dataSources = graph.getOverlappedDataSourceSequence(
node,
SALT_TYPE.STEXT_OVERLAPPING_RELATION);
if (dataSources != null)
{
for (DataSourceSequence seq : dataSources)
{
if (seq.getDataSource() instanceof STextualDS)
{
return (STextualDS) seq.getDataSource();
}
}
}
}
return null;
}
/**
* Returns a file name that is safe to use and does not have any invalid
* characters.
*
* @param orig
* @return
*/
public static String getSafeFileName(String orig)
{
if (orig != null)
{
return orig.replaceAll("[^0-9A-Za-z-]", "_");
}
else
{
return UUID.randomUUID().toString();
}
}
/**
* Gets all names of a corpus from a salt project.
*
* @param p
* @return returns an empty list if project is empty or null.
*/
public static Set<String> getToplevelCorpusNames(SaltProject p)
{
Set<String> names = new HashSet<>();
if (p != null && p.getCorpusGraphs() != null)
{
for (SCorpusGraph g : p.getCorpusGraphs())
{
if (g.getRoots() != null)
{
for (SNode c : g.getRoots())
{
names.add(c.getName());
}
}
}
}
return names;
}
/**
* Takes a map of salt node IDs to a value and return a new map that uses the
* SNodes as keys instead of the IDs.
*
* @param <V>
* @param map
* @param graph
* @return
*/
public static <V> Map<SNode, V> createSNodeMapFromIDs(Map<String, V> map,
SDocumentGraph graph)
{
HashMap<SNode, V> result = new LinkedHashMap<>();
if (map != null && graph != null)
{
for (Map.Entry<String, V> e : map.entrySet())
{
SNode n = graph.getNode(e.getKey());
if (n != null)
{
result.put(n, e.getValue());
}
}
}
return result;
}
}