package edu.hawaii.jmotif.performance.digits; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import cc.mallet.util.Randoms; import edu.hawaii.jmotif.performance.UCRGenericClassifier; import edu.hawaii.jmotif.performance.UCRUtils; import edu.hawaii.jmotif.text.SAXCollectionStrategy; import edu.hawaii.jmotif.text.TextUtils; import edu.hawaii.jmotif.text.WordBag; /** * Helper-runner for test. * * @author psenin * */ public class UCRdigitsWebProper extends UCRGenericClassifier { // data locations // private static final String TRAINING_DATA = "data/digits/digits_normalized_reduced_200.csv"; private static final String TEST_DATA = "data/digits/test.csv"; // SAX parameters to try // private static final int[][] params = { { 194, 15, 5, EXACT },}; private static final Randoms randoms = new Randoms(); /** * Runnable. * * @throws Exception if error occurs. */ public static void main(String[] args) throws Exception { // making training and test collections // Map<String, List<double[]>> trainData = UCRUtils.readUCRData(TRAINING_DATA); List<double[]> testData = readTestData(TEST_DATA); BufferedWriter bw = new BufferedWriter(new FileWriter(new File( "data/digits/third_try_1000.csv"))); bw.write("ImageId,Label\n"); // iterate over parameters // for (int[] p : params) { // converting back from easy encoding int WINDOW_SIZE = p[0]; int PAA_SIZE = p[1]; int ALPHABET_SIZE = p[2]; SAXCollectionStrategy strategy = SAXCollectionStrategy.CLASSIC; if (EXACT == p[3]) { strategy = SAXCollectionStrategy.EXACT; } else if (NOREDUCTION == p[3]) { strategy = SAXCollectionStrategy.NOREDUCTION; } // making training bags collection List<WordBag> bags = TextUtils.labeledSeries2WordBags(trainData, PAA_SIZE, ALPHABET_SIZE, WINDOW_SIZE, strategy); // getting TFIDF done HashMap<String, HashMap<String, Double>> tfidf = TextUtils.computeTFIDF(bags); int seriesCounter = 1; for (double[] series : testData) { WordBag test = TextUtils.seriesToWordBag("test", series, params[0]); // it is Cosine similarity, // // which ranges from 0.0 for the angle of 90 to 1.0 for the angle of 0 // i.e. LARGES value is a SMALLEST distance double minDist = Double.MIN_VALUE; String className = ""; double[] cosines = new double[tfidf.entrySet().size()]; int index = 0; for (Entry<String, HashMap<String, Double>> e : tfidf.entrySet()) { double dist = TextUtils.cosineSimilarity(test, e.getValue()); cosines[index] = dist; index++; if (dist > minDist) { className = e.getKey(); minDist = dist; } } // sometimes, due to the VECTORs specific layout, all values are the same, NEED to take care boolean allEqual = true; double cosine = cosines[0]; for (int i = 1; i < cosines.length; i++) { if (!(cosines[i] == cosine)) { allEqual = false; } } // report our findings if (!(allEqual)) { System.out.println(seriesCounter + ":" + className); bw.write(seriesCounter + "," + className + "\n"); } else { long guess = Math.round(randoms.nextUniform(-0.49999, 9.49999)); System.out.println(seriesCounter + ":" + guess); bw.write(seriesCounter + "," + guess + "\n"); } seriesCounter++; } } bw.close(); } private static List<double[]> readTestData(String fileName) throws NumberFormatException, IOException { List<double[]> res = new ArrayList<double[]>(); BufferedReader br = new BufferedReader(new FileReader(new File(fileName))); String line = ""; while ((line = br.readLine()) != null) { if (line.trim().length() == 0) { continue; } String[] split = line.trim().split(",|\\s+"); double[] series = new double[split.length]; for (int i = 0; i < split.length; i++) { series[i] = Double.valueOf(split[i].trim()).doubleValue(); } res.add(series); } br.close(); return res; } private static Double parseValue(String string) { Double res = Double.NaN; try { Double r = Double.valueOf(string); res = r; } catch (NumberFormatException e) { assert true; } return res; } }