package at.lux.retrieval.suffixtreemodel;

import at.lux.fotoretrieval.lucene.Relation;
import at.lux.fotoretrieval.lucene.Graph;
import at.lux.fotoretrieval.lucene.Path;
import at.lux.fotoretrieval.lucene.GraphPathExtractor;

import java.util.*;

/**
 * Suffix tree used to compare exactly two path documents.
 * <p/>
 * A document is a text with one path per line; every line is split into tokens and
 * all suffixes of the token sequence are inserted below a common root node. Two
 * documents (ids 1 and 2) can be added and compared with {@link #getSimilarity()}.
 * Optionally a corpus can be registered with {@link #addCorpusDocument(String)} to
 * enable the IDF based similarity types.
 * <p/>
 * Date: 15.02.2006 <br>
 * Time: 20:31:45 <br>
 * Know-Center Graz, Inffeldgasse 21a, 8010 Graz, AUSTRIA <br>
 *
 * @author Mathias Lux, mlux@know-center.at
 */
public class SuffixTree {
    /**
     * Defines the way relations are handled in this tree. Either they are left out or they
     * are only used as types with no direction indicator, or they are used as they appear in
     * the path.
     */
    public enum RelationType {NoRelations, UndirectedRelation, FullRelations}

    /**
     * Defines the type of similarity being used.
     */
    public enum SimilarityType {Unweighted, TermFrequency, TFIDF, IDF }

    /**
     * Defines if paths should be added only in one direction or in both directions.
     */
    public enum PathType {SingleDirection, BothDirections}

    /** Regular expression a token has to match to be treated as a node (digits only). */
    private static final String NODE_TOKEN_REGEX = "\\d+";

    private RelationType relationType = RelationType.NoRelations;
    // NOTE(review): this field is never read inside this class.
    private PathType pathType = PathType.SingleDirection;
    private SuffixTreeNode rootNode;
    /** Number of documents added through {@link #addDocument(String)} since the last reset. */
    private int docCount = 0;
    /** Number of documents added through {@link #addCorpusDocument(String)}. */
    private int corpusDocCount = 0;

    /**
     * Creates an empty tree that ignores relations ({@link RelationType#NoRelations}).
     */
    public SuffixTree() {
        init();
    }

    /**
     * Creates an empty tree with the given relation handling.
     *
     * @param relationType defines how relation tokens are treated when tokenizing paths
     */
    public SuffixTree(RelationType relationType) {
        this.relationType = relationType;
        init();
    }

    /** Creates the root node all suffixes are attached to. */
    private void init() {
        rootNode = new SuffixTreeNode("_root");
    }

    /**
     * Adds a document; the first document added gets id 1, the second one id 2.
     *
     * @param text the document content, one path per line
     * @throws UnsupportedOperationException if two documents have already been added
     */
    public void addDocument(String text) {
        if (docCount >= 2) {
            throw new UnsupportedOperationException("Only two documents can be added.");
        }
        docCount++;
        addDocument(text, docCount);
    }

    /**
     * Inserts all suffixes of every line of the given document under the given id.
     * Note that this method does not change the internal document counter, so
     * {@link #getSimilarity()} only works for documents added with
     * {@link #addDocument(String)}.
     *
     * @param text the document content, one path per line
     * @param id   the document id, either 1 or 2
     * @throws UnsupportedOperationException if the id is neither 1 nor 2
     */
    public void addDocument(String text, int id) {
        if (id != 1 && id != 2) {
            throw new UnsupportedOperationException("This is not meant to work like this ...");
        }
        String[] sentences = getSentences(text);
        for (int i = 0; i < sentences.length; i++) {
            List<String> tokens = getTokens(sentences[i]);
            // Insert the full token sequence, then each shorter suffix of it.
            while (!tokens.isEmpty()) {
                rootNode.addSuffix(tokens, id);
                tokens.remove(0);
            }
        }
    }

    /**
     * Calculates and returns the similarity using the default similarity type
     * ({@link SimilarityType#Unweighted}).
     *
     * @return the unweighted similarity of the two added documents
     * @throws UnsupportedOperationException if not exactly two documents have been added
     */
    public double getSimilarity() {
        return getSimilarity(SimilarityType.Unweighted);
    }

    /**
     * Calculates the similarity of the two added documents.
     * <p/>
     * For {@link SimilarityType#TFIDF} and {@link SimilarityType#IDF} the result is 1 without
     * consulting the corpus if the unweighted similarity is already 1.
     *
     * @param type the similarity measure to use
     * @return the similarity value; 0 if the type matches none of the known measures
     * @throws UnsupportedOperationException if not exactly two documents have been added, or
     *                                       if an IDF based type is requested although no
     *                                       corpus document has been added
     */
    public double getSimilarity(SimilarityType type) {
        if (docCount != 2) {
            throw new UnsupportedOperationException("Excactly 2 documents have to be added!");
        }
        double result = 0;
        if (type == SimilarityType.Unweighted) {
            result = computeUnweightedSimilarity();
        } else if (type == SimilarityType.TermFrequency) {
            TermFrequencyWalker walker = new TermFrequencyWalker();
            rootNode.traverseEdges(walker, SimilarityType.TermFrequency);
            result = averageOverEdges(walker);
        } else if (type == SimilarityType.TFIDF || type == SimilarityType.IDF) {
            // Shortcut for identical edge sets; deliberately evaluated before the corpus check.
            if (getSimilarity(SimilarityType.Unweighted) == 1) {
                return 1;
            }
            TermFrequencyWalker walker = new TermFrequencyWalker();
            if (corpusDocCount < 1) {
                throw new UnsupportedOperationException("For TF*IDF a corpus has to be applied to the Suffix Tree!");
            }
            walker.setCountCorpusDocuments(corpusDocCount);
            rootNode.traverseEdges(walker, type);
            result = averageOverEdges(walker);
        }
        return result;
    }

    /**
     * Relates the edge counter stored under key -1 (read as "traversed by both documents")
     * to the larger of the per document counters stored under the keys 1 and 2.
     * NOTE(review): if the larger per document counter is 0 the division yields NaN.
     */
    private double computeUnweightedSimilarity() {
        HashMap<Integer, Integer> doc2docEdgeCount = new HashMap<Integer, Integer>();
        doc2docEdgeCount.put(-1, 0);
        doc2docEdgeCount.put(1, 0);
        doc2docEdgeCount.put(2, 0);
        rootNode.getEdgesTraversed(doc2docEdgeCount);
        double bothDocsTraversed = doc2docEdgeCount.get(-1);
        double doc1Traversed = doc2docEdgeCount.get(1);
        double doc2Traversed = doc2docEdgeCount.get(2);
        return bothDocsTraversed / Math.max(doc2Traversed, doc1Traversed);
    }

    /** Divides the sum collected by the walker by the number of edges it counted. */
    private static double averageOverEdges(TermFrequencyWalker walker) {
        return walker.getSum() * (1d / (double) walker.getCountEdges());
    }

    /**
     * Use this method to train a suffix tree for a corpus. This allows the usage of TF*IDF,
     * as the inverse document frequency can be calculated.
     *
     * @param text the corpus document content, one path per line
     */
    public void addCorpusDocument(String text) {
        corpusDocCount++;
        String[] sentences = getSentences(text);
        for (String sentence : sentences) {
            List<String> tokens = getTokens(sentence);
            // Register the full token sequence, then each shorter suffix of it.
            while (!tokens.isEmpty()) {
                rootNode.prepareDocumentFrequency(tokens, corpusDocCount);
                tokens.remove(0);
            }
        }
    }

    /**
     * Splits a document into its lines.
     *
     * @param phrase the document content
     * @return the lines of the document
     */
    protected String[] getSentences(String phrase) {
        return phrase.split("\\n");
    }

    /**
     * Splits a line into tokens according to the relation handling of this tree:
     * <ul>
     * <li>NoRelations: only tokens consisting of digits are kept,</li>
     * <li>FullRelations: every token is kept as it is,</li>
     * <li>UndirectedRelation: tokens consisting of digits are kept; every other token is
     * replaced by its inverted relation type unless it is a key of
     * {@code Relation.relationMapping}.</li>
     * </ul>
     * Empty tokens, which result from blank lines, leading whitespace or consecutive
     * whitespace characters, are skipped in every mode.
     *
     * @param sentence a single line of a document
     * @return a modifiable list of tokens, or {@code null} if no relation type is set
     */
    protected List<String> getTokens(String sentence) {
        if (relationType == null) {
            // Only reachable when the tree was built with a null relation type.
            return null;
        }
        String[] parts = sentence.split("\\s");
        List<String> tokens = new ArrayList<String>(parts.length);
        for (int i = 0; i < parts.length; i++) {
            String token = parts[i];
            if (token.length() == 0) {
                // split("\\s") yields empty strings around extra whitespace; they are no tokens.
                continue;
            }
            if (relationType == RelationType.FullRelations) {
                tokens.add(token);
            } else if (token.matches(NODE_TOKEN_REGEX)) {
                // Tokens made of digits are nodes and are kept in the remaining two modes.
                tokens.add(token);
            } else if (relationType == RelationType.UndirectedRelation) {
                // invert the relation if it is no key of the mapping table.
                if (!Relation.relationMapping.containsKey(token)) {
                    token = Relation.invertRelationType(token);
                }
                tokens.add(token);
            }
            // NoRelations: tokens that are not nodes are dropped.
        }
        return tokens;
    }

    /**
     * Used to create a SuffixTree document from a Graph object, using single direction
     * paths and a maximum length of -1.
     *
     * @param g the graph
     * @return a string representing the document built from the paths.
     */
    public static String createSuffixTreeDocument(Graph g) {
        return createSuffixTreeDocument(g, PathType.SingleDirection, -1);
    }

    /**
     * Used to create a SuffixTree document from a Graph object. Each distinct path string
     * is written once, on its own line and without its first and last character
     * (presumably the enclosing brackets of the path's string form).
     *
     * @param g         the graph
     * @param pathType  with {@link PathType#BothDirections} the string obtained from
     *                  {@code path.toString(true)} is added for every path as well
     * @param maxLength passed on to the path extraction
     * @return a string representing the document built from the paths.
     */
    public static String createSuffixTreeDocument(Graph g, PathType pathType, int maxLength) {
        Path[] paths = GraphPathExtractor.extractPaths(g.toString(), maxLength);
        Set<String> seen = new HashSet<String>(paths.length);
        StringBuilder sb = new StringBuilder(256);
        for (int i = 0; i < paths.length; i++) {
            Path path = paths[i];
            appendIfUnseen(sb, seen, path.toString());
            if (pathType == PathType.BothDirections) {
                appendIfUnseen(sb, seen, path.toString(true));
            }
        }
        return sb.toString();
    }

    /** Appends the path string minus its first and last character as a line, once per distinct string. */
    private static void appendIfUnseen(StringBuilder document, Set<String> seen, String pathString) {
        if (seen.add(pathString)) {
            document.append(pathString.substring(1, pathString.length() - 1));
            document.append('\n');
        }
    }

    /**
     * Used to create a SuffixTree document from a Graph object with a maximum length of -1.
     *
     * @param g        the graph
     * @param pathType defines if paths are added in one or in both directions
     * @return a string representing the document built from the paths.
     */
    public static String createSuffixTreeDocument(Graph g, PathType pathType) {
        return createSuffixTreeDocument(g, pathType, -1);
    }

    /**
     * Used to create a SuffixTree document from a Graph object, using single direction paths.
     *
     * @param g         the graph
     * @param maxLength passed on to the path extraction
     * @return a string representing the document built from the paths.
     */
    public static String createSuffixTreeDocument(Graph g, int maxLength) {
        return createSuffixTreeDocument(g, PathType.SingleDirection, maxLength);
    }

    /**
     * Allows to use the very same suffix tree for another similarity
     * calculation by deleting the inserted documents. The corpus document
     * counter is left untouched.
     */
    public void resetSimilarity() {
        rootNode.resetSimilarity();
        docCount = 0;
    }

    /**
     * Collects the paths of a graph without duplicates (compared by their string form).
     * For a negative maximum length a single extraction with -1 is done, otherwise one
     * extraction for every length from 0 up to and including the maximum length.
     * NOTE(review): this helper is not referenced anywhere within this class.
     *
     * @param g         the graph
     * @param maxLength the maximum length, negative for a single extraction with -1
     * @return the distinct paths; an empty array if there are none
     */
    private static Path[] getPaths(Graph g, int maxLength) {
        List<Path> paths = new ArrayList<Path>();
        Set<String> seen = new HashSet<String>();
        String graphString = g.toString();
        if (maxLength < 0) {
            addUnseenPaths(GraphPathExtractor.extractPaths(graphString, -1), seen, paths);
        } else {
            for (int length = 0; length <= maxLength; length++) {
                addUnseenPaths(GraphPathExtractor.extractPaths(graphString, length), seen, paths);
            }
        }
        // A zero length seed array guarantees an empty result for an empty list; a seed of
        // length one would be returned as it is, holding a single null element.
        return paths.toArray(new Path[0]);
    }

    /** Adds every candidate whose string form has not been seen yet to the target list. */
    private static void addUnseenPaths(Path[] candidates, Set<String> seen, List<Path> target) {
        for (int i = 0; i < candidates.length; i++) {
            if (seen.add(candidates[i].toString())) {
                target.add(candidates[i]);
            }
        }
    }
}