package edu.hawaii.jmotif.sequitur; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.StringTokenizer; import java.util.TreeSet; import java.util.concurrent.atomic.AtomicInteger; import edu.hawaii.jmotif.sax.SAXFactory; import edu.hawaii.jmotif.sax.alphabet.Alphabet; import edu.hawaii.jmotif.sax.alphabet.NormalAlphabet; import edu.hawaii.jmotif.sax.datastructures.SAXFrequencyData; import edu.hawaii.jmotif.text.SAXCollectionStrategy; import edu.hawaii.jmotif.text.TextUtils; import edu.hawaii.jmotif.text.WordBag; import edu.hawaii.jmotif.timeseries.TSException; import edu.hawaii.jmotif.timeseries.TSUtils; /** * Sort of a stand-alone factory to digesting strings with Sequitur. * * @author psenin * */ public class SequiturFactory { private static final double NORMALIZATION_THRESHOLD = 0.005; /** Alphabet instance we'll use. */ private static final Alphabet normalA = new NormalAlphabet(); /** * Digests a string of symbols separated by space. * * @param string The string to digest. Symbols expected to be separated by space. * * @return The top rule handler. */ public static synchronized SAXRule digest(String string) { // clear global collections // SAXSymbol.theDigrams.clear(); SAXRule.numRules = new AtomicInteger(0); SAXRule.indexes = new TreeSet<Integer>(); // SAXRule.arrayRuleStrings = new ArrayList<String>(); SAXRule.arrSAXRuleRecords = new ArrayList<SAXRuleRecord>(); // init the top-level rule // SAXRule resRule = new SAXRule(); // tokenize the input string // StringTokenizer st = new StringTokenizer(string, " "); // while there are tokens int currentPosition = 0; while (st.hasMoreTokens()) { String token = st.nextToken(); // System.out.println(token); // extract next token SAXTerminal symbol = new SAXTerminal(token, currentPosition); // append to the end of the current sequitur string // ... As each new input symbol is observed, append it to rule S.... resRule.last().insertAfter(symbol); // once appended, check if the resulting digram is new or recurrent // // ... Each time a link is made between two symbols if the new // digram is repeated elsewhere // and the repetitions do not overlap, if the other occurrence is a // complete rule, // replace the new digram with the non-terminal symbol that heads // the rule, // otherwise,form a new rule and replace both digrams with the new // non-terminal symbol // otherwise, insert the digram into the index... resRule.last().p.check(); currentPosition++; } return resRule; } public static List<WordBag> labeledSeries2WordBags(Map<String, List<double[]>> data, int paaSize, int alphabetSize, int windowSize, SAXCollectionStrategy strategy) throws IndexOutOfBoundsException, TSException { int[] params = new int[4]; params[0] = windowSize; params[1] = paaSize; params[2] = alphabetSize; params[3] = strategy.index(); // make a map of resulting bags Map<String, WordBag> preRes = new HashMap<String, WordBag>(); // int counter = 0; // process series one by one building word bags for (Entry<String, List<double[]>> e : data.entrySet()) { String classLabel = e.getKey(); WordBag bag = new WordBag(classLabel); for (double[] series : e.getValue()) { // System.out.println(counter); WordBag cb = seriesToWordBag("tmp", series, params); // System.out.println(cb.toString()); bag.mergeWith(cb); // counter++; } preRes.put(classLabel, bag); } List<WordBag> res = new ArrayList<WordBag>(); res.addAll(preRes.values()); return res; } private static synchronized WordBag seriesToWordBag(String label, double[] series, int[] params) throws IndexOutOfBoundsException, TSException { // the result WordBag resultBag = new WordBag(label); // parameters int windowSize = params[0]; int paaSize = params[1]; int alphabetSize = params[2]; SAXCollectionStrategy strategy = SAXCollectionStrategy.fromValue(params[3]); // the SAX data structure SAXFrequencyData saxFrequencyData = new SAXFrequencyData(); // need for numerosity reduction String previousString = ""; // scan across the time series extract sub sequences, and convert // them to strings for (int i = 0; i < series.length - (windowSize - 1); i++) { // fix the current subsection double[] subSection = Arrays.copyOfRange(series, i, i + windowSize); // Z normalize it if (TSUtils.stDev(subSection) > NORMALIZATION_THRESHOLD) { subSection = TSUtils.zNormalize(subSection); } // perform PAA conversion if needed double[] paa = TSUtils.optimizedPaa(subSection, paaSize); // Convert the PAA to a string. char[] currentString = TSUtils.ts2String(paa, normalA.getCuts(alphabetSize)); if (SAXCollectionStrategy.CLASSIC.equals(strategy)) { if (previousString.length() > 0 && SAXFactory.strDistance(currentString, previousString.toCharArray()) == 0) { continue; } } else if (SAXCollectionStrategy.EXACT.equals(strategy)) { if (previousString.equalsIgnoreCase(String.valueOf(currentString))) { continue; } } previousString = String.valueOf(currentString); saxFrequencyData.put(new String(currentString), i); } SAXRule rule = SequiturFactory.digest(saxFrequencyData.getSAXString(" ")); ArrayList<SAXRuleRecord> expandedRules = rule.getSAXRules(); for (SAXRuleRecord r : expandedRules) { if (0 == r.getRuleFrequency()) { continue; } resultBag.addWord(r.getExpandedRuleString(), r.getRuleFrequency()); } String displayString = rule.getGrammarDisplayString(); StringTokenizer st = new StringTokenizer(displayString, " "); // while there are tokens while (st.hasMoreTokens()) { String token = st.nextToken(); if (token.startsWith("R")) { continue; } resultBag.addWord(token); } return resultBag; } public static int classify(String classKey, double[] series, HashMap<String, HashMap<String, Double>> tfidf, int paaSize, int alphabetSize, int windowSize, SAXCollectionStrategy strategy) throws IndexOutOfBoundsException, TSException { int[] params = new int[4]; params[0] = windowSize; params[1] = paaSize; params[2] = alphabetSize; params[3] = strategy.index(); WordBag test = seriesToWordBag("test", series, params); // it is Cosine similarity, // // which ranges from 0.0 for the angle of 90 to 1.0 for the angle of 0 // i.e. LARGES value is a SMALLEST distance double minDist = Double.MIN_VALUE; String className = ""; double[] cosines = new double[tfidf.entrySet().size()]; int index = 0; for (Entry<String, HashMap<String, Double>> e : tfidf.entrySet()) { double dist = TextUtils.cosineSimilarity(test, e.getValue()); cosines[index] = dist; index++; if (dist > minDist) { className = e.getKey(); minDist = dist; } } // sometimes, due to the VECTORs specific layout, all values are the same, NEED to take care boolean allEqual = true; double cosine = cosines[0]; for (int i = 1; i < cosines.length; i++) { if (!(cosines[i] == cosine)) { allEqual = false; } } // report our findings if (!(allEqual) && className.equalsIgnoreCase(classKey)) { return 1; } // System.out.println("all equal " + allEqual + ", assigned to " + className + " instead of " + // classKey); return 0; } }