package edu.hawaii.jmotif.text;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import edu.hawaii.jmotif.sax.SAXFactory;
import edu.hawaii.jmotif.sax.alphabet.Alphabet;
import edu.hawaii.jmotif.sax.alphabet.NormalAlphabet;
import edu.hawaii.jmotif.timeseries.TSException;
import edu.hawaii.jmotif.timeseries.TSUtils;
/**
* Implements text statistics and mining utilities.
*
* @author psenin
*
*/
public final class TextUtils {
// Field separator kept for delimited output; it is not referenced by any method of this class,
// hence the suppressed warning.
@SuppressWarnings("unused")
private static final String COMMA = ",";
// Line terminator appended after the header line and after data rows by the table-rendering
// methods of this class.
private static final String CR = "\n";
// Five-decimal number formatter; tfidfToTable uses it to print the zero written for a word that
// is missing from a document's weight map.
private static final DecimalFormat df = new DecimalFormat("#0.00000");
// The alphabet whose getCuts(alphabetSize) supplies the cut points used when subseries are
// converted into SAX words.
private static final Alphabet a = new NormalAlphabet();
/**
 * Private constructor: all members of this class are static, so no instance is ever needed.
 */
private TextUtils() {
assert true;
}
/**
 * Computes TF*IDF weights for every word seen in the corpus, for every document.
 *
 * The TF part is taken from {@link #normalizedTF(WordBag, String)}; the IDF part is
 * log10(N / df), where N is the number of documents and df is the number of documents whose
 * word map contains the word. A word that is absent from a document, or that is present in all
 * documents, gets the weight of zero in that document.
 *
 * @param texts The collection of text documents for which the statistics need to be computed.
 * @return The map of source documents names to the word - tf*idf weight collections. Every
 * inner map has an entry for every word observed anywhere in the corpus.
 */
public static synchronized HashMap<String, HashMap<String, Double>> computeTFIDF(
    Collection<WordBag> texts) {
  // the number of docs
  int totalDocs = texts.size();
  // the result. map of document names to the pairs word - tfidf weight
  HashMap<String, HashMap<String, Double>> res = new HashMap<String, HashMap<String, Double>>();
  // document frequency: for each observed word, the number of bags which contain it
  HashMap<String, AtomicInteger> allWords = new HashMap<String, AtomicInteger>();
  for (WordBag bag : texts) {
    // here populate result map with empty entries
    res.put(bag.getLabel(), new HashMap<String, Double>());
    // each key occurs once per bag, so one increment per bag is the document count
    for (String word : bag.getInternalWords().keySet()) {
      AtomicInteger docsWithWord = allWords.get(word);
      if (null == docsWithWord) {
        allWords.put(word, new AtomicInteger(1));
      }
      else {
        docsWithWord.incrementAndGet();
      }
    }
  }
  // outer loop - iterating over documents
  for (WordBag bag : texts) {
    HashMap<String, Double> weights = res.get(bag.getLabel());
    HashMap<String, AtomicInteger> bagWords = bag.getInternalWords();
    // TF*IDF is reported for all words ever seen in the set, not only for the words of this bag
    for (Entry<String, AtomicInteger> word : allWords.entrySet()) {
      int docsWithWord = word.getValue().intValue();
      // by default it is zero
      double tfidf = 0;
      // a word found in every document has log10(1) = 0 for IDF, so it is skipped altogether
      if (bagWords.containsKey(word.getKey()) && totalDocs != docsWithWord) {
        double tfValue = normalizedTF(bag, word.getKey());
        double idfLOGValue = Math.log10((double) totalDocs / (double) docsWithWord);
        tfidf = tfValue * idfLOGValue;
      }
      weights.put(word.getKey(), tfidf);
    }
  }
  return res;
}
/**
 * Computes TF*IDF weights for every bigram seen in the corpus, for every bag.
 *
 * TF is ln(f / total + 1), where f is the bigram count in the bag and total is the value of the
 * bag's getTotalWordCount(). IDF is ln(N / df), where N is the number of bags and df is the
 * number of bags for which contains(bigram) is true.
 *
 * @param bags The bigram bags, one per document.
 * @return The map of bag labels to the bigram - tf*idf weight collections. A label gets an entry
 * only when at least one bigram was observed in the corpus.
 */
public static synchronized HashMap<String, HashMap<Bigram, Double>> computeTFIDF(List<BigramBag> bags) {
  // the result. map of document names to the pairs bigram - tfidf weight
  HashMap<String, HashMap<Bigram, Double>> res = new HashMap<String, HashMap<Bigram, Double>>();
  // build a sorted collection of all observed bigrams
  TreeSet<Bigram> allWords = new TreeSet<Bigram>();
  for (BigramBag bag : bags) {
    allWords.addAll(bag.getBigramSet());
  }
  // The IDF depends on the bigram only, so it is computed once per bigram instead of once per
  // (bag, bigram) pair. The array is indexed by the position of the bigram in the iteration
  // order of allWords, which is the same in the loop below.
  double totalDocs = (double) bags.size();
  double[] idfLOGValues = new double[allWords.size()];
  int position = 0;
  for (Bigram word : allWords) {
    int docsWithWord = 0;
    for (BigramBag bag : bags) {
      if (bag.contains(word)) {
        docsWithWord = docsWithWord + 1;
      }
    }
    idfLOGValues[position] = Math.log(totalDocs / (double) docsWithWord);
    position++;
  }
  // outer loop - iterating over documents
  for (BigramBag bag : bags) {
    // fix the doc name
    String bagName = bag.getLabel();
    HashMap<Bigram, Integer> bagWords = bag.getBigrams();
    double bagSize = Integer.valueOf(bag.getTotalWordCount()).doubleValue();
    position = 0;
    for (Bigram word : allWords) {
      // a bigram which is missing from this bag has the frequency of zero
      int wordFrequency = 0;
      if (bagWords.containsKey(word)) {
        wordFrequency = bagWords.get(word);
      }
      // TF = we take a log and correct for 0 by adding 1
      double tfLOGValue = Math.log(wordFrequency / bagSize + 1.0D);
      // the inner map is created lazily, on the first weight stored for this label
      HashMap<Bigram, Double> weights = res.get(bagName);
      if (null == weights) {
        weights = new HashMap<Bigram, Double>();
        res.put(bagName, weights);
      }
      weights.put(word, tfLOGValue * idfLOGValues[position]);
      position++;
    }
  }
  return res;
}
/**
 * Computes the logarithmically scaled term frequency: 1 + ln(f), where f is the frequency of
 * the term in the bag.
 *
 * @param bag The words bag.
 * @param term The term.
 * @return The scaled term frequency, or zero if the bag does not contain the term.
 */
public static synchronized double logTF(WordBag bag, String term) {
  if (!bag.contains(term)) {
    return 0d;
  }
  double frequency = bag.getWordFrequency(term).doubleValue();
  return 1.0d + Math.log(frequency);
}
/**
 * Computes the normalized term frequency: the frequency of the term in the bag divided by the
 * value reported by the bag's getMaxFrequency().
 *
 * @param bag The words bag.
 * @param term The term.
 * @return The normalized term frequency, or zero if the bag does not contain the term.
 */
public static synchronized double normalizedTF(WordBag bag, String term) {
  if (!bag.contains(term)) {
    return 0;
  }
  double termFrequency = Integer.valueOf(bag.getWordFrequency(term)).doubleValue();
  double maxFrequency = Integer.valueOf(bag.getMaxFrequency()).doubleValue();
  return termFrequency / maxFrequency;
}
/**
 * Computes the augmented term frequency: 0.5 + f / (2 * max), where f is the frequency of the
 * term in the bag and max is the value reported by the bag's getMaxFrequency().
 *
 * @param bag The words bag.
 * @param term The term.
 * @return The augmented term frequency, or zero if the bag does not contain the term.
 */
public static synchronized double augmentedTF(WordBag bag, String term) {
  if (!bag.contains(term)) {
    return 0;
  }
  double termFrequency = Integer.valueOf(bag.getWordFrequency(term)).doubleValue();
  double doubledMax = 2.0D * Integer.valueOf(bag.getMaxFrequency()).doubleValue();
  return 0.5D + termFrequency / doubledMax;
}
/**
 * Computes the log-average term frequency: (1 + ln(f)) / (1 + ln(avg)), where f is the
 * frequency of the term in the bag and avg is the value reported by the bag's
 * getAverageFrequency().
 *
 * @param bag The words bag.
 * @param term The term.
 * @return The log-average term frequency, or zero if the bag does not contain the term.
 */
public static synchronized double logAveTF(WordBag bag, String term) {
  if (!bag.contains(term)) {
    return 0;
  }
  double numerator = 1D + Math.log(Integer.valueOf(bag.getWordFrequency(term)).doubleValue());
  double denominator = 1D + Math.log(bag.getAverageFrequency());
  return numerator / denominator;
}
/**
 * Computes the document frequency, DF: the number of bags in the collection which contain the
 * term.
 *
 * @param bags The word bags collection.
 * @param string The string term.
 * @return The DF value.
 */
public static synchronized int df(HashMap<String, WordBag> bags, String string) {
  int docsWithTerm = 0;
  for (WordBag candidate : bags.values()) {
    docsWithTerm += candidate.contains(string) ? 1 : 0;
  }
  return docsWithTerm;
}
/**
 * Computes the inverse document frequency as the plain ratio N / df, where N is the number of
 * bags and df is the number of bags which contain the term. No logarithm is applied here.
 *
 * @param bags The bags of words collection.
 * @param string The string (term).
 * @return The idf value.
 */
public static synchronized double idf(HashMap<String, WordBag> bags, String string) {
  double totalDocs = (double) bags.size();
  double docsWithTerm = (double) df(bags, string);
  return totalDocs / docsWithTerm;
}
/**
 * Renders the TF*IDF weights as a comma-separated table.
 *
 * The first line holds an empty quoted cell followed by the quoted document names. Every other
 * line holds a quoted word followed by its weight in each document, in the same document order.
 * Words are listed in their natural (sorted) order. A word missing from a document is printed as
 * a formatted zero, and a row whose cells are all zero is left out of the table.
 *
 * @param tfidf The map of document names to the word - weight collections.
 * @return The table as a single string.
 */
public static synchronized String tfidfToTable(HashMap<String, HashMap<String, Double>> tfidf) {
  // the sorted union of the words of all documents
  TreeSet<String> vocabulary = new TreeSet<String>();
  for (Entry<String, HashMap<String, Double>> column : tfidf.entrySet()) {
    vocabulary.addAll(column.getValue().keySet());
  }
  // the header line - the document names
  StringBuilder table = new StringBuilder("\"\",");
  for (Entry<String, HashMap<String, Double>> column : tfidf.entrySet()) {
    table.append("\"").append(column.getKey()).append("\",");
  }
  // drop the trailing comma before terminating the line
  table.setLength(table.length() - 1);
  table.append(CR);
  // the rows, one per word
  for (String word : vocabulary) {
    int zeroCells = 0;
    StringBuilder row = new StringBuilder();
    row.append("\"").append(word).append("\",");
    for (Entry<String, HashMap<String, Double>> column : tfidf.entrySet()) {
      HashMap<String, Double> weights = column.getValue();
      if (!weights.containsKey(word)) {
        row.append(df.format(0.0d)).append(",");
        zeroCells++;
        continue;
      }
      Double weight = weights.get(word);
      row.append(weight).append(",");
      if (weight.equals(0D)) {
        zeroCells++;
      }
    }
    row.setLength(row.length() - 1);
    row.append("\n");
    // a row of zeros only carries no information and is not printed
    if (zeroCells != tfidf.size()) {
      table.append(row.toString());
    }
  }
  return table.toString();
}
/**
 * Normalize the vector to the norm of 1, i.e. divide every component by the Euclidean length of
 * the vector.
 *
 * Zero components are kept as zero. In particular, a vector of zeros, whose length is zero, is
 * returned as a vector of zeros rather than as a vector of NaN values.
 *
 * @param vector the vector.
 * @return normalized vector, a new map with the same keys; the argument is not modified.
 */
public static synchronized HashMap<String, Double> normalizeToUnitVector(
    HashMap<String, Double> vector) {
  double sumOfSquares = 0d;
  for (double value : vector.values()) {
    sumOfSquares = sumOfSquares + value * value;
  }
  double norm = Math.sqrt(sumOfSquares);
  HashMap<String, Double> res = new HashMap<String, Double>();
  for (Entry<String, Double> e : vector.entrySet()) {
    double value = e.getValue();
    // guards 0/0 when the whole vector is made of zeros
    double scaled = (0d == value) ? 0d : value / norm;
    res.put(e.getKey(), scaled);
  }
  return res;
}
/**
 * Computes a cosine normalization of TFIDF statistics: the weights of each document are divided
 * by the Euclidean length of that document's weight vector. Weights equal to zero stay zero.
 *
 * @param data The data, a map of document names to the word - weight collections.
 * @return The normalized tfidf statistics, a new map; the argument is not modified.
 */
public static synchronized HashMap<String, HashMap<String, Double>> normalizeToUnitVectors(
    HashMap<String, HashMap<String, Double>> data) {
  HashMap<String, HashMap<String, Double>> normalized = new HashMap<String, HashMap<String, Double>>();
  for (String docName : data.keySet()) {
    HashMap<String, Double> weights = data.get(docName);
    // the length of the document vector
    double sumOfSquares = 0D;
    for (double weight : weights.values()) {
      if (weight != 0.) {
        sumOfSquares += weight * weight;
      }
    }
    double norm = Math.sqrt(sumOfSquares);
    // scale every non-zero weight, zeros are copied as they are
    HashMap<String, Double> unitWeights = new HashMap<String, Double>(weights.size());
    for (Entry<String, Double> cell : weights.entrySet()) {
      Double raw = cell.getValue();
      double scaled = 0D;
      if (!raw.equals(0D)) {
        scaled = raw / norm;
      }
      unitWeights.put(cell.getKey(), scaled);
    }
    normalized.put(docName, unitWeights);
  }
  return normalized;
}
/**
 * Computes a cosine normalization of bigram TFIDF statistics: the weights of each document are
 * divided by the Euclidean length of that document's weight vector.
 *
 * Zero weights are kept as zero, so a document whose weights are all zero (and whose vector
 * length is therefore zero) yields zeros rather than NaN values.
 *
 * @param data The data, a map of document names to the bigram - weight collections.
 * @return The normalized statistics, a new map; the argument is not modified.
 */
public static HashMap<String, HashMap<Bigram, Double>> normalizeBigramsToUnitVectors(
    HashMap<String, HashMap<Bigram, Double>> data) {
  // result
  HashMap<String, HashMap<Bigram, Double>> res = new HashMap<String, HashMap<Bigram, Double>>();
  // cosine normalize these rows corresponding to docs TFIDF
  for (Entry<String, HashMap<Bigram, Double>> e : data.entrySet()) {
    // normalization coefficient calculation
    double sum = 0D;
    for (double el : e.getValue().values()) {
      sum = sum + el * el;
    }
    double sqRoot = Math.sqrt(sum);
    // now all the non-zero elements must be divided by its value
    HashMap<Bigram, Double> newEntry = new HashMap<Bigram, Double>();
    for (Entry<Bigram, Double> val : e.getValue().entrySet()) {
      double value = val.getValue();
      // guards 0/0 when the whole row is made of zeros
      double newValue = (0D == value) ? 0D : value / sqRoot;
      newEntry.put(val.getKey(), newValue);
    }
    // place it to map
    res.put(e.getKey(), newEntry);
  }
  return res;
}
/**
 * Computes the cosine of the angle between two vectors, i.e. their dot product divided by the
 * product of their magnitudes. Despite the method name, the returned value is a similarity.
 *
 * Both maps must have exactly the same key set; the components are paired by key.
 *
 * @param data1 The data vector 1.
 * @param data2 The data vector 2.
 * @return The cosine value.
 * @throws RuntimeException if the key sets of the two vectors differ.
 */
public static synchronized double cosineDistance(HashMap<String, Double> data1,
    HashMap<String, Double> data2) {
  // sanity check: the same words have to be present in both vectors
  boolean sameWords = data2.keySet().containsAll(data1.keySet())
      && data2.keySet().size() == data1.keySet().size();
  if (!sameWords) {
    throw new RuntimeException("COSINE SIMILARITY ERROR: word sets are different in length!");
  }
  // lay both vectors out in the key order of the first one
  double[] first = new double[data1.size()];
  double[] second = new double[data2.size()];
  int position = 0;
  for (String word : data1.keySet()) {
    first[position] = data1.get(word);
    second[position] = data2.get(word);
    position++;
  }
  double dot = dotProduct(first, second);
  double lengths = magnitude(first) * magnitude(second);
  return dot / lengths;
}
/**
 * Computes the cosine similarity between a bag of words and a vector of weights.
 *
 * The dot product is accumulated over the words of the bag which are also keys of the weight
 * vector; it is divided by the product of the magnitude of the bag's word counts and the
 * magnitude of all the weights.
 *
 * @param testSample The bag of words.
 * @param weightVector The word - weight vector.
 * @return The cosine similarity value.
 */
public static synchronized double cosineSimilarity(WordBag testSample,
    HashMap<String, Double> weightVector) {
  double dot = 0;
  for (Entry<String, Integer> wordCount : testSample.getWords().entrySet()) {
    String word = wordCount.getKey();
    if (!weightVector.containsKey(word)) {
      continue;
    }
    dot = dot + wordCount.getValue().doubleValue() * weightVector.get(word).doubleValue();
  }
  double bagMagnitude = magnitude(testSample.getWordsAsDoubles().values());
  double weightsMagnitude = magnitude(weightVector.values());
  return dot / (bagMagnitude * weightsMagnitude);
}
/**
 * Computes the cosine similarity between a bag of bigrams and a vector of weights.
 *
 * The dot product is accumulated over the bigrams of the bag which are also keys of the weight
 * vector; it is divided by the product of the magnitude of the bag's bigram counts and the
 * magnitude of all the weights.
 *
 * @param testSample The bag of bigrams.
 * @param weightVector The bigram - weight vector.
 * @return The cosine similarity value.
 */
private static double cosineSimilarity(BigramBag testSample, HashMap<Bigram, Double> weightVector) {
  double dot = 0;
  for (Entry<Bigram, Integer> bigramCount : testSample.getBigrams().entrySet()) {
    Bigram bigram = bigramCount.getKey();
    if (!weightVector.containsKey(bigram)) {
      continue;
    }
    dot = dot + bigramCount.getValue().doubleValue() * weightVector.get(bigram).doubleValue();
  }
  double bagMagnitude = magnitude(testSample.getBigramsAsDoubles().values());
  double weightsMagnitude = magnitude(weightVector.values());
  return dot / (bagMagnitude * weightsMagnitude);
}
/**
 * Wraps the TF*IDF weights into a new {@link CosineDistanceMatrix}.
 *
 * @param tfidf The map of document names to the word - weight collections.
 * @return The matrix constructed from the weights.
 */
public static synchronized CosineDistanceMatrix getCosineDistanceMatrix(
    HashMap<String, HashMap<String, Double>> tfidf) {
  return new CosineDistanceMatrix(tfidf);
}
/**
 * Compute the magnitude (Euclidean length) of the vector: the square root of the dot product of
 * the vector with itself.
 *
 * @param vector The vector.
 * @return The magnitude.
 */
public static synchronized double magnitude(double[] vector) {
  double squaredLength = dotProduct(vector, vector);
  return Math.sqrt(squaredLength);
}
/**
 * Compute the magnitude (Euclidean length) of the vector of boxed values: the square root of the
 * dot product of the vector with itself.
 *
 * @param vector The vector.
 * @return The magnitude.
 */
public static synchronized double magnitude(Double[] vector) {
  double squaredLength = dotProduct(vector, vector);
  return Math.sqrt(squaredLength);
}
/**
 * Compute the magnitude (Euclidean length) of a collection of values: the square root of the sum
 * of their squares.
 *
 * @param values The vector components; the order does not matter.
 * @return The magnitude, zero for an empty collection.
 */
private static synchronized double magnitude(Collection<Double> values) {
  // primitive accumulator: avoids re-boxing the running sum on every iteration
  double sumOfSquares = 0.0D;
  for (double v : values) {
    sumOfSquares = sumOfSquares + v * v;
  }
  return Math.sqrt(sumOfSquares);
}
/**
 * Compute the dot product of two vectors: the sum of the products of the components at the same
 * index. The number of summed components is the length of the first vector.
 *
 * @param vector1 The vector 1.
 * @param vector2 The vector 2.
 * @return The dot product.
 */
public static synchronized double dotProduct(double[] vector1, double[] vector2) {
  double sum = 0.0D;
  int position = 0;
  while (position < vector1.length) {
    sum += vector1[position] * vector2[position];
    position++;
  }
  return sum;
}
/**
 * Compute the dot product of two vectors of boxed values: the sum of the products of the
 * components at the same index. The number of summed components is the length of the first
 * vector.
 *
 * @param vector1 The vector 1.
 * @param vector2 The vector 2.
 * @return The dot product.
 */
public static synchronized double dotProduct(Double[] vector1, Double[] vector2) {
  double sum = 0.0D;
  int position = 0;
  while (position < vector1.length) {
    sum += vector1[position] * vector2[position];
    position++;
  }
  return sum;
}
/**
 * Converts a time series into a bag of SAX words using a sliding window.
 *
 * Every window is z-normalized, reduced with PAA and converted into a word. Depending on the
 * collection strategy a word may be skipped: with CLASSIC, when its distance to the previously
 * kept word (as computed by SAXFactory.strDistance) is zero; with EXACT, when it equals the
 * previously kept word, ignoring the case. With any other strategy every word is kept.
 *
 * @param label The label of the resulting bag.
 * @param series The time series.
 * @param params The SAX parameters: index 0 - sliding window size, index 1 - PAA size, index 2 -
 * alphabet size, index 3 - the collection strategy value.
 * @return The words bag.
 * @throws IndexOutOfBoundsException If error occurs.
 * @throws TSException If error occurs.
 */
public static synchronized WordBag seriesToWordBag(String label, double[] series, int[] params)
    throws IndexOutOfBoundsException, TSException {
  WordBag resultBag = new WordBag(label);
  int windowSize = params[0];
  int paaSize = params[1];
  int alphabetSize = params[2];
  SAXCollectionStrategy strategy = SAXCollectionStrategy.fromValue(params[3]);
  // the last word which was placed into the bag
  String previousWord = "";
  int lastStart = series.length - windowSize;
  for (int start = 0; start <= lastStart; start++) {
    double[] paa = TSUtils.optimizedPaa(
        TSUtils.zNormalize(TSUtils.subseries(series, start, windowSize)), paaSize);
    char[] sax = TSUtils.ts2String(paa, a.getCuts(alphabetSize));
    // decide whether this word repeats the previous one under the chosen strategy
    boolean repeated = false;
    if (SAXCollectionStrategy.CLASSIC.equals(strategy)) {
      repeated = previousWord.length() > 0
          && SAXFactory.strDistance(sax, previousWord.toCharArray()) == 0;
    }
    else if (SAXCollectionStrategy.EXACT.equals(strategy)) {
      repeated = previousWord.equalsIgnoreCase(String.valueOf(sax));
    }
    if (repeated) {
      continue;
    }
    previousWord = String.valueOf(sax);
    resultBag.addWord(previousWord);
  }
  return resultBag;
}
/**
 * Converts a time series into a bag of bigrams of SAX words.
 *
 * For every set of parameters the series is converted into a sequence of SAX words with a
 * sliding window, skipping repeated words according to the collection strategy (CLASSIC: zero
 * SAXFactory.strDistance to the previously kept word; EXACT: equal to it, ignoring the case).
 * The words are then fed, one by one, into a bigram; once the bigram reports itself complete it
 * is added to the bag and a new bigram is started with the same word.
 *
 * @param label The label of the resulting bag.
 * @param series The time series.
 * @param params The sets of SAX parameters, each one holding: index 0 - sliding window size,
 * index 1 - PAA size, index 2 - alphabet size, index 3 - the collection strategy value.
 * @return The bigrams bag.
 * @throws TSException If error occurs.
 */
protected static synchronized BigramBag seriesToBigramBag(String label, double[] series,
    int[][] params) throws TSException {
  BigramBag resultBag = new BigramBag(label);
  for (int[] p : params) {
    int windowSize = p[0];
    int paaSize = p[1];
    int alphabetSize = p[2];
    SAXCollectionStrategy strategy = SAXCollectionStrategy.fromValue(p[3]);
    // first, convert the series into the text
    ArrayList<String> text = new ArrayList<String>();
    String previousWord = "";
    int lastStart = series.length - windowSize;
    for (int start = 0; start <= lastStart; start++) {
      double[] paa = TSUtils.optimizedPaa(
          TSUtils.zNormalize(TSUtils.subseries(series, start, windowSize)), paaSize);
      char[] sax = TSUtils.ts2String(paa, a.getCuts(alphabetSize));
      boolean repeated = false;
      if (SAXCollectionStrategy.CLASSIC.equals(strategy)) {
        repeated = previousWord.length() > 0
            && SAXFactory.strDistance(sax, previousWord.toCharArray()) == 0;
      }
      else if (SAXCollectionStrategy.EXACT.equals(strategy)) {
        repeated = previousWord.equalsIgnoreCase(String.valueOf(sax));
      }
      if (repeated) {
        continue;
      }
      previousWord = String.valueOf(sax);
      text.add(previousWord);
    }
    // second, cut the text into bigrams
    Bigram current = new Bigram();
    for (String word : text) {
      current.setNext(word);
      if (!current.isComplete()) {
        continue;
      }
      resultBag.add(current);
      // the word which completed the bigram also opens the next one
      current = new Bigram();
      current.setNext(word);
    }
  }
  return resultBag;
}
/**
 * Converts labeled time series into bags of words, one bag per class label. This is a
 * convenience overload which packs the SAX parameters into an array and delegates to
 * {@link #labeledSeries2WordBags(Map, int[])}.
 *
 * @param data The map of class labels and representatives.
 * @param paaSize The PAA size.
 * @param alphabetSize The alphabet size.
 * @param windowSize The sliding window size.
 * @param strategy The words collection strategy.
 * @return The word bags.
 * @throws IndexOutOfBoundsException If error occurs.
 * @throws TSException If error occurs.
 */
public static synchronized List<WordBag> labeledSeries2WordBags(Map<String, List<double[]>> data,
    int paaSize, int alphabetSize, int windowSize, SAXCollectionStrategy strategy)
    throws IndexOutOfBoundsException, TSException {
  // the layout is: window size, PAA size, alphabet size, strategy index
  int[] params = { windowSize, paaSize, alphabetSize, strategy.index() };
  return labeledSeries2WordBags(data, params);
}
/**
 * Converts timeseries datastructure into the bags of words. It is assumed that every key in the
 * data map is the timeseries class label. The corresponding list of double arrays is a set of
 * representatives of this class; the words of all of them are merged into a single bag labeled
 * with the class label.
 *
 * @param data The map of class labels and representatives.
 * @param params The set of SAX parameters to use, index 0 - sliding window size, index 1 - PAA
 * size, index 2 - alphabet size, index 3 - the collection strategy value.
 * @return The word bags, one per class label.
 * @throws IndexOutOfBoundsException If error occurs.
 * @throws TSException If error occurs.
 */
public static synchronized List<WordBag> labeledSeries2WordBags(Map<String, List<double[]>> data,
    int[] params) throws IndexOutOfBoundsException, TSException {
  // class label to its bag
  Map<String, WordBag> bagsByLabel = new HashMap<String, WordBag>();
  for (Entry<String, List<double[]>> labeled : data.entrySet()) {
    WordBag classBag = new WordBag(labeled.getKey());
    // every representative contributes its words to the bag of the class
    for (double[] series : labeled.getValue()) {
      classBag.mergeWith(seriesToWordBag("tmp", series, params));
    }
    bagsByLabel.put(labeled.getKey(), classBag);
  }
  return new ArrayList<WordBag>(bagsByLabel.values());
}
/**
 * Converts labeled multivariate time series into bags of words, one bag per class label. Every
 * row of every multivariate representative is converted into words independently, and all the
 * words are merged into the bag of the class.
 *
 * @param data The map of class labels and multivariate representatives.
 * @param params The set of SAX parameters to use, index 0 - sliding window size, index 1 - PAA
 * size, index 2 - alphabet size, index 3 - the collection strategy value.
 * @return The word bags, one per class label.
 * @throws IndexOutOfBoundsException If error occurs.
 * @throws TSException If error occurs.
 */
public static synchronized List<WordBag> labeledMultivariateSeries2WordBags(
    Map<String, List<double[][]>> data, int[] params) throws IndexOutOfBoundsException,
    TSException {
  // class label to its bag
  Map<String, WordBag> bagsByLabel = new HashMap<String, WordBag>();
  for (Entry<String, List<double[][]>> labeled : data.entrySet()) {
    WordBag classBag = new WordBag(labeled.getKey());
    bagsByLabel.put(labeled.getKey(), classBag);
    for (double[][] multivariate : labeled.getValue()) {
      // each dimension is treated as a separate series
      for (double[] dimension : multivariate) {
        classBag.mergeWith(seriesToWordBag("tmp", dimension, params));
      }
    }
  }
  return new ArrayList<WordBag>(bagsByLabel.values());
}
/**
 * Converts labeled time series into bags of bigrams, one bag per class label. The bigrams of all
 * representatives of a class are merged into a single bag labeled with the class label.
 *
 * @param data The map of class labels and representatives.
 * @param params The sets of SAX parameters to use, each one holding: index 0 - sliding window
 * size, index 1 - PAA size, index 2 - alphabet size, index 3 - the collection strategy value.
 * @return The bigram bags, one per class label.
 * @throws IndexOutOfBoundsException If error occurs.
 * @throws TSException If error occurs.
 */
public static synchronized List<BigramBag> labeledSeries2BigramBags(
    Map<String, List<double[]>> data, int[][] params) throws IndexOutOfBoundsException,
    TSException {
  // class label to its bag
  Map<String, BigramBag> bagsByLabel = new HashMap<String, BigramBag>();
  for (Entry<String, List<double[]>> labeled : data.entrySet()) {
    BigramBag classBag = new BigramBag(labeled.getKey());
    bagsByLabel.put(labeled.getKey(), classBag);
    // every representative contributes its bigrams to the bag of the class
    for (double[] series : labeled.getValue()) {
      classBag.mergeWith(seriesToBigramBag("tmp", series, params));
    }
  }
  return new ArrayList<BigramBag>(bagsByLabel.values());
}
/**
 * Performs classification of a single series. This is a convenience overload which packs the
 * SAX parameters into an array and delegates to
 * {@link #classify(String, double[], HashMap, int[])}.
 *
 * @param classKey The target class key.
 * @param series The series to test.
 * @param tfidf The TF*IDF weights data structure.
 * @param paaSize The PAA size.
 * @param alphabetSize The alphabet size.
 * @param windowSize The sliding window size.
 * @param strategy The words collection strategy.
 * @return The value returned by the delegate: 1 or 0.
 * @throws IndexOutOfBoundsException If error occurs.
 * @throws TSException If error occurs.
 */
public static synchronized int classify(String classKey, double[] series,
    HashMap<String, HashMap<String, Double>> tfidf, int paaSize, int alphabetSize,
    int windowSize, SAXCollectionStrategy strategy) throws IndexOutOfBoundsException, TSException {
  // the layout is: window size, PAA size, alphabet size, strategy index
  int[] params = { windowSize, paaSize, alphabetSize, strategy.index() };
  return classify(classKey, series, tfidf, params);
}
/**
 * Performs classification: the series is converted into a bag of words and compared, by the
 * cosine similarity, with the weight vector of every class.
 *
 * @param classKey The target class key, if series will appear of this class, returns 1.
 * @param series The series to test.
 * @param tfidf The TF*IDF weights data structure.
 * @param params SAX parameters to use.
 * @return 1 if the class with the largest similarity has the name equal (ignoring the case) to
 * the class key and the similarities are not all the same; 0 otherwise, including the cases of
 * an empty or a single-class tfidf structure.
 *
 * @throws IndexOutOfBoundsException If error occurs.
 * @throws TSException If error occurs.
 */
public static synchronized int classify(String classKey, double[] series,
    HashMap<String, HashMap<String, Double>> tfidf, int[] params)
    throws IndexOutOfBoundsException, TSException {
  WordBag test = seriesToWordBag("test", series, params);
  // nothing to compare with - no class can be assigned
  if (tfidf.isEmpty()) {
    return 0;
  }
  // it is Cosine similarity, i.e. the LARGEST value is the SMALLEST distance. The search starts
  // below any possible value; note that Double.MIN_VALUE is the smallest POSITIVE double and
  // is not suitable for that.
  double bestSimilarity = Double.NEGATIVE_INFINITY;
  String className = "";
  double[] cosines = new double[tfidf.size()];
  int index = 0;
  for (Entry<String, HashMap<String, Double>> e : tfidf.entrySet()) {
    double similarity = TextUtils.cosineSimilarity(test, e.getValue());
    cosines[index] = similarity;
    index++;
    if (similarity > bestSimilarity) {
      className = e.getKey();
      bestSimilarity = similarity;
    }
  }
  // sometimes, due to the VECTORs specific layout, all values are the same; in this case the
  // winner is arbitrary and the series is reported as not classified
  boolean allEqual = true;
  for (int i = 1; i < cosines.length; i++) {
    if (cosines[i] != cosines[0]) {
      allEqual = false;
      break;
    }
  }
  // report our findings
  if (!allEqual && className.equalsIgnoreCase(classKey)) {
    return 1;
  }
  return 0;
}
/**
 * Finds the class whose weight vector has the largest cosine similarity with the bag of words.
 *
 * @param test The bag of words to classify.
 * @param tfidf The TF*IDF weights data structure, a map of class names to weight vectors.
 * @return The name of the class with the largest similarity; the first one encountered wins a
 * tie. An empty string is returned if tfidf is empty or if no similarity compares as greater
 * than negative infinity (e.g. all of them are NaN).
 */
public static synchronized String classify(WordBag test, HashMap<String, HashMap<String, Double>> tfidf) {
  // it is Cosine similarity, i.e. the LARGEST value is the SMALLEST distance. The search starts
  // below any possible value; note that Double.MIN_VALUE is the smallest POSITIVE double and
  // is not suitable for that.
  double bestSimilarity = Double.NEGATIVE_INFINITY;
  String className = "";
  for (Entry<String, HashMap<String, Double>> e : tfidf.entrySet()) {
    double similarity = TextUtils.cosineSimilarity(test, e.getValue());
    if (similarity > bestSimilarity) {
      className = e.getKey();
      bestSimilarity = similarity;
    }
  }
  return className;
}
/**
 * Performs classification using bigrams: the series is converted into a bag of bigrams and
 * compared, by the cosine similarity, with the weight vector of every class.
 *
 * @param classKey The target class key.
 * @param series The series to test.
 * @param tfidf The TF*IDF weights data structure, a map of class names to bigram weights.
 * @param params The sets of SAX parameters to use.
 * @return 1 if the class with the largest similarity has the name equal (ignoring the case) to
 * the class key and the similarities are not all the same; 0 otherwise, including the cases of
 * an empty or a single-class tfidf structure.
 * @throws TSException If error occurs.
 */
public static synchronized int classifyBigrams(String classKey, double[] series,
    HashMap<String, HashMap<Bigram, Double>> tfidf, int[][] params) throws TSException {
  BigramBag test = seriesToBigramBag("test", series, params);
  // nothing to compare with - no class can be assigned
  if (tfidf.isEmpty()) {
    return 0;
  }
  // the largest similarity seen so far and the class it belongs to
  double bestSimilarity = -1.0d;
  String className = "";
  double[] cosines = new double[tfidf.size()];
  int index = 0;
  for (Entry<String, HashMap<Bigram, Double>> e : tfidf.entrySet()) {
    double similarity = TextUtils.cosineSimilarity(test, e.getValue());
    cosines[index] = similarity;
    index++;
    if (similarity > bestSimilarity) {
      className = e.getKey();
      bestSimilarity = similarity;
    }
  }
  // when all similarities are the same the winner is arbitrary, so nothing is reported
  boolean allEqual = true;
  for (int i = 1; i < cosines.length; i++) {
    if (cosines[i] != cosines[0]) {
      allEqual = false;
      break;
    }
  }
  if (!allEqual && className.equalsIgnoreCase(classKey)) {
    return 1;
  }
  return 0;
}
/**
 * Performs classification of a multivariate sample: for every set of SAX parameters every row
 * of the data is converted into SAX words, all the words are collected in a single bag, and the
 * bag is compared, by the cosine similarity, with the weight vector of every class.
 *
 * @param classKey The target class key.
 * @param data The series to test, one series per row.
 * @param tfidf The TF*IDF weights data structure.
 * @param params The sets of SAX parameters, each one holding: index 0 - sliding window size,
 * index 1 - PAA size, index 2 - alphabet size, index 3 - the collection strategy value.
 * @return 1 if the class with the largest similarity has the name equal (ignoring the case) to
 * the class key and the similarities are not all the same; 0 otherwise, including the cases of
 * an empty or a single-class tfidf structure.
 * @throws IndexOutOfBoundsException If error occurs.
 * @throws TSException If error occurs.
 */
public static synchronized int classify(String classKey, double[][] data,
    HashMap<String, HashMap<String, Double>> tfidf, int[][] params)
    throws IndexOutOfBoundsException, TSException {
  WordBag test = new WordBag("test");
  for (int[] p : params) {
    int windowSize = p[0];
    int paaSize = p[1];
    int alphabetSize = p[2];
    SAXCollectionStrategy strategy = SAXCollectionStrategy.fromValue(p[3]);
    // the previously kept word; it is NOT reset between the rows of the data
    String oldStr = "";
    for (double[] series : data) {
      for (int j = 0; j <= series.length - windowSize; j++) {
        double[] paa = TSUtils.optimizedPaa(
            TSUtils.zNormalize(TSUtils.subseries(series, j, windowSize)), paaSize);
        char[] sax = TSUtils.ts2String(paa, a.getCuts(alphabetSize));
        // skip the word if it repeats the previous one under the chosen strategy
        if (SAXCollectionStrategy.CLASSIC.equals(strategy)) {
          if (oldStr.length() > 0 && SAXFactory.strDistance(sax, oldStr.toCharArray()) == 0) {
            continue;
          }
        }
        else if (SAXCollectionStrategy.EXACT.equals(strategy)) {
          if (oldStr.equalsIgnoreCase(String.valueOf(sax))) {
            continue;
          }
        }
        oldStr = String.valueOf(sax);
        test.addWord(oldStr);
      }
    }
  }
  // nothing to compare with - no class can be assigned
  if (tfidf.isEmpty()) {
    return 0;
  }
  // the largest similarity seen so far and the class it belongs to
  double bestSimilarity = -1.0d;
  String className = "";
  double[] cosines = new double[tfidf.size()];
  int index = 0;
  for (Entry<String, HashMap<String, Double>> e : tfidf.entrySet()) {
    double similarity = TextUtils.cosineSimilarity(test, e.getValue());
    cosines[index] = similarity;
    index++;
    if (similarity > bestSimilarity) {
      className = e.getKey();
      bestSimilarity = similarity;
    }
  }
  // when all similarities are the same the winner is arbitrary, so nothing is reported
  boolean allEqual = true;
  for (int i = 1; i < cosines.length; i++) {
    if (cosines[i] != cosines[0]) {
      allEqual = false;
      break;
    }
  }
  if (!allEqual && className.equalsIgnoreCase(classKey)) {
    return 1;
  }
  return 0;
}
/**
 * Renders a bag of words as a two-column comma-separated table. The first line holds the quoted
 * label of the bag; every other line holds a quoted word and its frequency. Words are listed in
 * their natural (sorted) order and words with the frequency of zero are left out.
 *
 * @param bag The words bag.
 * @return The table as a single string.
 */
public static synchronized String wordBagToTable(WordBag bag) {
  TreeSet<String> sortedWords = new TreeSet<String>();
  sortedWords.addAll(bag.getWordSet());
  // the header line - the name of the bag
  StringBuilder table = new StringBuilder("\"" + bag.getLabel() + "\"");
  table.append(CR);
  // the rows, one per word
  for (String word : sortedWords) {
    Integer frequency = bag.getWordFrequency(word);
    if (frequency != 0) {
      table.append("\"").append(word).append("\",");
      table.append(frequency).append(CR);
    }
  }
  return table.toString();
}
/**
 * Renders a list of word bags as a comma-separated table of word frequencies.
 *
 * The first line holds an empty quoted cell followed by the quoted labels of the bags. Every
 * other line holds a quoted word followed by its frequency in each bag, one cell per bag, in the
 * order of the list - so the rows stay aligned with the header even if two bags share a label.
 * Words are listed in their natural (sorted) order. A word missing from a bag is printed as
 * zero, and a row whose cells are all zero is left out of the table.
 *
 * @param bags The word bags.
 * @return The table as a single string.
 */
public static synchronized String bagsToTable(List<WordBag> bags) {
  // melt together sets of keys and fetch the word maps once, in the order of the columns
  TreeSet<String> words = new TreeSet<String>();
  List<HashMap<String, Integer>> columns = new ArrayList<HashMap<String, Integer>>(bags.size());
  for (WordBag bag : bags) {
    words.addAll(bag.getWordSet());
    columns.add(bag.getWords());
  }
  // print keys - the dictionaries names
  StringBuilder sb = new StringBuilder("\"\",");
  for (WordBag bag : bags) {
    sb.append("\"").append(bag.getLabel()).append("\",");
  }
  // drop the trailing comma before terminating the line
  sb.delete(sb.length() - 1, sb.length()).append(CR);
  // print rows, one by one
  for (String w : words) {
    int zeroCounter = 0;
    StringBuilder rowSB = new StringBuilder();
    rowSB.append("\"").append(w).append("\",");
    for (HashMap<String, Integer> data : columns) {
      if (data.containsKey(w)) {
        Integer count = data.get(w);
        rowSB.append(count).append(",");
        if (count.equals(0)) {
          zeroCounter++;
        }
      }
      else {
        rowSB.append(0).append(",");
        zeroCounter++;
      }
    }
    rowSB.delete(rowSB.length() - 1, rowSB.length()).append("\n");
    // a row of zeros only carries no information and is not printed
    if (zeroCounter != bags.size()) {
      sb.append(rowSB.toString());
    }
  }
  return sb.toString();
}
}