package examples;

import gov.sandia.cognition.text.algorithm.ValenceSpreader;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * This example shows how to run the ValenceSpreader code on a set of documents.
 * The document set is the movie review set (available for download from
 * https://www.cs.cornell.edu/people/pabo/movie-review-data/) and the starting
 * term scores are the ANEW set (available from
 * http://csea.phhp.ufl.edu/media/anewmessage.html if you're the right kind of
 * researcher).
 *
 * This code is meant as a fairly simple example of how to call the valence
 * spreader and the different ways to call into it. This example is not intended
 * to produce the best possible results for learning a movie-review classifier
 * (in fact, the ANEW term lists appear to be very bad starting places to learn
 * such a classifier from).
 *
 * @author jdwendt
 */
public class ValenceAnewExample {

    /**
     * This is a simple DTO for the ANEW data (sentiment, arousal, and
     * dominance).
     */
    private static class AnewData {

        /**
         * The sentiment score
         */
        double sentiment;

        /**
         * The arousal score
         */
        double arousal;

        /**
         * The dominance score
         */
        double dominance;

        /**
         * Loads the input values into the correct data members.
         *
         * @param sentiment
         * @param arousal
         * @param dominance
         */
        AnewData(double sentiment, double arousal, double dominance) {
            this.sentiment = sentiment;
            this.arousal = arousal;
            this.dominance = dominance;
        }
    }

    /**
     * Parses the CSV data at the input location into a map from terms to ANEW
     * scores. Note that this parses the ANEW CSV we've downloaded. If it
     * doesn't match your version, you may need to update this method. It
     * returns the word and the mean score for each of the sentiment, arousal,
     * and dominance scores.
     *
     * @param filename The file to parse
     * @return The data stored in the input file (for mean scores, not stdev)
     */
    private static Map<String, AnewData> parseAnewFile(String filename) {
        Map<String, AnewData> ret = new HashMap<String, AnewData>();
        try {
            BufferedReader br = new BufferedReader(new FileReader(filename));
            String line = null;
            boolean firstline = true;
            while ((line = br.readLine()) != null) {
                // Skip the header
                if (firstline) {
                    firstline = false;
                    continue;
                }
                // It's a basic CSV (no tricks, escaped commas, etc.)
                String[] elements = line.split(",");
                if (elements.length != 9) {
                    System.err.println("Found a line with wrong number of "
                            + "columns: \"" + line + "\"");
                    continue;
                }
                // Add the mean scores for the word
                ret.put(elements[0], new AnewData(
                        Double.parseDouble(elements[2]),
                        Double.parseDouble(elements[4]),
                        Double.parseDouble(elements[6])));
            }
            br.close();
            return ret;
        } catch (IOException ioe) {
            throw new RuntimeException("Error reading file \"" + filename
                    + "\"", ioe);
        }
    }
    /**
     * Returns the term frequency for each term in the document stored in the
     * input filename location. Note that this assumes the input file is in the
     * movie-review format (already lower-cased, spaces between all words and
     * even surrounding all punctuation). This includes punctuation as "terms".
     *
     * @param filename The file location to parse
     * @return All terms found in the document (key) and the number of times
     *         they exist (value).
     */
    private static Map<String, Double> getTermFrequency(String filename) {
        Map<String, Double> termFrequency = new HashMap<String, Double>();
        try {
            BufferedReader br = new BufferedReader(new FileReader(filename));
            String line = null;
            while ((line = br.readLine()) != null) {
                // These files are so pretty you can just split on spaces
                // and consider the punctuation their own "words" (that should
                // filter out in TF-IDF the same way "the" and "of" do)
                String[] words = line.split(" ");
                for (String word : words) {
                    if (!termFrequency.containsKey(word)) {
                        termFrequency.put(word, 0.0);
                    }
                    termFrequency.put(word, termFrequency.get(word) + 1);
                }
            }
            br.close();
            return termFrequency;
        } catch (IOException ioe) {
            throw new RuntimeException("Error reading file \"" + filename
                    + "\"", ioe);
        }
    }

    /**
     * Reads all files in the input directory and returns a map containing the
     * filename (key) and the term frequencies (value) for all terms in each
     * document. Note that all documents' maps only include terms (key) and
     * counts (value) that were found in that document (no zero scores).
     *
     * @param path The path to the parent directory containing the documents to
     *            parse.
     * @return A map containing all files' names found in the input directory
     *         and their term frequencies.
     */
    private static Map<String, Map<String, Double>> parseReviewFiles(
            String path) {
        File dir = new File(path);
        if (!dir.exists() || !dir.isDirectory()) {
            throw new RuntimeException("Although it looks like the base "
                    + "directory is there, the subdirectory \"" + path
                    + "\" doesn't seem to exist.");
        }
        Map<String, Map<String, Double>> ret =
                new HashMap<String, Map<String, Double>>();
        for (String filename : dir.list()) {
            ret.put(filename, getTermFrequency(path + filename));
        }
        return ret;
    }
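    /**
     * A minimal sketch, not called by main below, of how the resulting word
     * scores mentioned at the end of main could be saved to a CSV file for use
     * as a future classifier. The method name and CSV layout are our own
     * invention, not part of the ValenceSpreader API; we only assume the
     * learned term scores are available as a map from term to score (main's
     * Result object exposes documentWeights this way).
     *
     * @param termWeights The learned term scores to persist
     * @param filename The location to write the CSV to
     */
    private static void writeTermWeightsCsv(Map<String, Double> termWeights,
            String filename) {
        // Fully qualified names keep this sketch self-contained without
        // touching the imports above
        try (java.io.PrintWriter out =
                new java.io.PrintWriter(new java.io.FileWriter(filename))) {
            out.println("term,score");
            for (Map.Entry<String, Double> e : termWeights.entrySet()) {
                // The movie-review tokens contain no commas, so no CSV
                // escaping is attempted here
                out.println(e.getKey() + "," + e.getValue());
            }
        } catch (IOException ioe) {
            throw new RuntimeException("Error writing file \"" + filename
                    + "\"", ioe);
        }
    }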
    /**
     * Loads all documents into the solver. This adds the documents with _no_
     * scores assigned.
     *
     * @param solver The solver to load the documents into
     * @param documents The documents to add to the solver
     * @param useTermFrequency If true, term frequency is added, else only
     *            binary exists/doesn't is added.
     */
    private static void loadDocuments(ValenceSpreader<String, String> solver,
            Map<String, Map<String, Double>> documents,
            boolean useTermFrequency) {
        for (Map.Entry<String, Map<String, Double>> e : documents.entrySet()) {
            if (useTermFrequency) {
                solver.addDocumentTermWeights(e.getKey(), e.getValue());
            } else {
                solver.addDocumentTermOccurrences(e.getKey(),
                        e.getValue().keySet());
            }
        }
    }

    /**
     * Simple enum for specifying which ANEW score to use as seed data
     */
    private enum AnewType {

        /**
         * The sentiment (happy/sad) scores
         */
        SENTIMENT,

        /**
         * The arousal (excited/depressed) scores
         */
        AROUSAL,

        /**
         * The dominance (in-control/controlled) scores
         */
        DOMINANCE,

        /**
         * Don't add any scores for terms before solving (assumes some
         * documents were scored)
         */
        NONE;
    }
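    /**
     * Also not called by main below; a minimal sketch of how saved term
     * scores might be used as a stand-alone classifier on an unseen document.
     * This is our own illustration, not part of the ValenceSpreader API: it
     * simply averages the learned weights of the document's terms, weighted
     * by term frequency. Terms the spreader never scored are skipped.
     *
     * @param termWeights The learned term scores (e.g., read back from the
     *            CSV written by writeTermWeightsCsv above)
     * @param termFrequency The term frequencies of the document to score
     * @return The document's score; above zero suggests positive, below zero
     *         suggests negative (after any offset correction)
     */
    private static double scoreDocument(Map<String, Double> termWeights,
            Map<String, Double> termFrequency) {
        double sum = 0.0;
        double numScored = 0.0;
        for (Map.Entry<String, Double> e : termFrequency.entrySet()) {
            Double weight = termWeights.get(e.getKey());
            // Unscored terms contribute nothing to the document's score
            if (weight != null) {
                sum += weight * e.getValue();
                numScored += e.getValue();
            }
        }
        return numScored == 0.0 ? 0.0 : sum / numScored;
    }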
    /**
     * The meat of this example. This code begins at the top with a set of
     * parameters you may wish to change to test how well the algorithm runs
     * with different input values. This is followed by some code that reads in
     * and loads the data, the code that runs the valence spreader, and then
     * finally some code to print the resulting confusion matrix.
     *
     * From our quick tests of this code, we found that all three of the ANEW
     * initial classifiers are very poor classifiers for movie reviews, while
     * seeding with labeled documents does considerably better (~75% accuracy
     * with only 5% of the documents labeled).
     *
     * @param args Ignored
     */
    public static void main(String[] args) {
        // The path where we will look for the ANEW data
        String anewFilename = "Data/anew-1999/all.csv";
        // The path where we will look for the movie review datasets
        String movieReviewsPath = "Data/review_polarity/txt_sentoken/";
        // If true, term frequency will be loaded into the solver, else just
        // binary occurrence
        boolean useTermFrequency = false;
        // The number of document scores to seed it with (which are randomly
        // chosen from the data). NOTE: All documents will be loaded into the
        // solver; this specifies how many labels to add to the data.
        int numSeedDocScores = 50;
        // Which feature to rank on
        AnewType whichScore = AnewType.NONE;
        // If zero, there's no middle group. The middle group separates false
        // positives from true positives by a middle band of "uncertain"
        double middleGroup = 0.0;
        // Offset if the scores are not quite zero-centered; use 0 if you
        // think the center is properly set.
        double offset = -0.02;
        // The power maps more-or-less to the distance that a score spreads.
        // Too high, and the algorithm takes too long and you won't like the
        // results.
        int power = 8;

        // Get all of the ANEW-ranked terms.
        // Our ANEW file is a very basic CSV file with the columns as follows:
        // 0 - term, 1 - Word No., 2 - Sentiment mean, 3 - Sentiment SD,
        // 4 - Arousal mean, 5 - Arousal SD, 6 - Dominance mean, 7 - Dom SD,
        // 8 - Word frequency
        // We parse the term and all of the means
        File anewFile = new File(anewFilename);
        if (!anewFile.exists()) {
            System.out.println("Can't run without the ANEW file. The ANEW\n"
                    + "sentiment file is expected in a subdirectory of the current\n"
                    + "directory as follows: " + anewFilename + "\n");
            System.out.println("The ANEW dataset can be requested from\n"
                    + "http://csea.phhp.ufl.edu/media/anewmessage.html. If you\n"
                    + "are the right kind of researcher, they'll share it with\n"
                    + "you.");
            return;
        }
        Map<String, AnewData> anewScores = parseAnewFile(anewFilename);

        // Get all of the positive and negative reviews (keep them separate
        // for later scoring of the results)
        File movieReviewDir = new File(movieReviewsPath);
        if (!movieReviewDir.exists()) {
            System.out.println("Can't run without movie review files. These\n"
                    + "are expected in a subdirectory of the current directory as\n"
                    + "follows: " + movieReviewsPath + "\n");
            System.out.println("The movie review dataset can be downloaded from\n"
                    + "https://www.cs.cornell.edu/people/pabo/movie-review-data/\n"
                    + "(we used polarity_dataset_v2.0 for testing this code).");
            return;
        }
        Map<String, Map<String, Double>> positiveDocuments = parseReviewFiles(
                movieReviewsPath + "pos/");
        Map<String, Map<String, Double>> negativeDocuments = parseReviewFiles(
                movieReviewsPath + "neg/");

        //
        // BEGIN ACTUAL EXAMPLE CODE
        //
        ValenceSpreader<String, String> solver =
                new ValenceSpreader<String, String>();

        // Load the documents into the solver.
        // NOTE: These documents are loaded w/o scores attached. If you only
        // have one set of "unknown valence" documents, you would only call
        // loadDocuments once.
        loadDocuments(solver, negativeDocuments, useTermFrequency);
        loadDocuments(solver, positiveDocuments, useTermFrequency);

        // Load the correct term scores into the solver
        if (whichScore != AnewType.NONE) {
            for (Map.Entry<String, AnewData> e : anewScores.entrySet()) {
                switch (whichScore) {
                    case SENTIMENT:
                        solver.addWeightedTerm(e.getKey(), e.getValue().sentiment);
                        break;
                    case AROUSAL:
                        solver.addWeightedTerm(e.getKey(), e.getValue().arousal);
                        break;
                    case DOMINANCE:
                        solver.addWeightedTerm(e.getKey(), e.getValue().dominance);
                        break;
                    default:
                        throw new RuntimeException("Unknown term type: "
                                + whichScore + ". Expected SENTIMENT, "
                                + "AROUSAL, or DOMINANCE.");
                }
            }
        }

        // Load a random (as defined by HashMap iteration order) set of
        // documents' scores (the same number of positive and negative).
        // These documents were already loaded above (all terms already in the
        // system). Here we're just telling the system how to score them.
        int cnt = 0;
        for (String docId : negativeDocuments.keySet()) {
            solver.addWeightedDocument(docId, -5);
            if (++cnt >= numSeedDocScores) {
                break;
            }
        }
        cnt = 0;
        for (String docId : positiveDocuments.keySet()) {
            solver.addWeightedDocument(docId, 5);
            if (++cnt >= numSeedDocScores) {
                break;
            }
        }

        // We need to balance the scores (as ANEW is 0..10, and we want them
        // balanced around 0)
        solver.centerWeightsRange();

        // Now we solve and look at the results!
        ValenceSpreader.Result<String, String> r = solver.spreadValence(power);

        // Now collect the cells of the confusion matrix for positive and
        // negative, with middleGroup as the wide splitting ground
        int[] confusionMatrix = { 0, 0, 0, 0, 0, 0 };
        final int TRUE_POS = 0;
        final int FALSE_POS = 1;
        final int TRUE_NEG = 2;
        final int FALSE_NEG = 3;
        final int MIDDLE_POS = 4;
        final int MIDDLE_NEG = 5;
        for (String docId : negativeDocuments.keySet()) {
            if (r.documentWeights.get(docId) - offset < -middleGroup) {
                ++confusionMatrix[TRUE_NEG];
            } else if (r.documentWeights.get(docId) - offset > middleGroup) {
                ++confusionMatrix[FALSE_POS];
            } else {
                ++confusionMatrix[MIDDLE_NEG];
            }
        }
        for (String docId : positiveDocuments.keySet()) {
            if (r.documentWeights.get(docId) - offset > middleGroup) {
                ++confusionMatrix[TRUE_POS];
            } else if (r.documentWeights.get(docId) - offset < -middleGroup) {
                ++confusionMatrix[FALSE_NEG];
            } else {
                ++confusionMatrix[MIDDLE_POS];
            }
        }

        // NOTE: The resulting word scores aren't used here, but they could be
        // saved to a CSV file for use as a future classifier; see the
        // writeTermWeightsCsv and scoreDocument sketches above.
        System.out.println("Resulting confusion matrix:");
        System.out.println("        | EXPECTED    |");
        System.out.println("        | POS  | NEG  |");
        System.out.println("RES POS | " + confusionMatrix[TRUE_POS] + " | "
                + confusionMatrix[FALSE_POS] + " |");
        System.out.println("RES MID | " + confusionMatrix[MIDDLE_POS] + " | "
                + confusionMatrix[MIDDLE_NEG] + " |");
        System.out.println("RES NEG | " + confusionMatrix[FALSE_NEG] + " | "
                + confusionMatrix[TRUE_NEG] + " |");
    }
}