import java.io.File; import java.io.FileNotFoundException; import java.math.BigDecimal; import java.math.BigInteger; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map.Entry; import java.util.NoSuchElementException; import java.util.Scanner; /** * Consumptive document scanner. Capable of basic analysis. You may tokenize * everything at startup, and use getTokens(), or you may request single, * consumed tokens one at a time using getNextValidToken(). * * @author Eric Fruchter * */ public class TokenScanner { /** * Single line of file, latest grabbed from Scanner. */ private String latestLine = ""; /** * The file being scanned. */ private File file; /** * The File reader for the document. */ private final Scanner fileScanner; /** * LEXICONS */ final static String az = "abcdefghijklmnopqrstuvwxyz"; final static String AZ = az.toUpperCase(); final static String digits = "0123456789"; final static String hyphen = "-"; final static String space = " "; final static String dot = "."; final static String punctuation = dot + ",!?"; final static String quote = "\""; /** * Load a document into the scanner. * * @param f * the file. * @throws FileNotFoundException */ public TokenScanner(final File f) throws FileNotFoundException { fileScanner = new Scanner(file = f); } /** * Consume a raw string token and return it. * * @return a raw token, if it exists. */ private Token nextRawToken() throws NoSuchElementException { // Location of the char after the current projected token. int sLoc = 0; while (latestLine.trim().isEmpty()) { latestLine = fileScanner.nextLine(); } // Find the next end segment while (latestLine.length() != sLoc + 1 && latestLine.charAt(sLoc + 1) != ' ') { // && getType(document.substring(0, sLoc + 1)) == TokenType.INVALID) // { sLoc++; } // End indices of the token, inclusive. int endTestToken = sLoc; /* * Determine the longest sub-token by starting from the end and working * backwards. */ boolean valid = false; for (int i = endTestToken; i >= 0; i--) { String testToken = latestLine.substring(0, i + 1); if (getType(testToken) != TokenType.INVALID) { sLoc = i; valid = true; break; } } if (!valid) { sLoc = 0; } String token = latestLine.substring(0, sLoc + 1); Token t = new Token(token, getType(token)); runDiagnostic(t); // Consume the token form the document latestLine = latestLine.substring(sLoc + 1).trim(); return t; } /** * Diagnostic info. */ private Token prevToken = new Token("INVALID", TokenType.INVALID); final private HashMap<Token, Integer> VAR_FREQUENCY = new HashMap<Token, Integer>(); private int totalVARS = 0, totalINTS = 0, totalFLOATS = 0; private Token minINT, maxINT, minFLOAT, maxFLOAT; private int varFollowOccurrences = 0; private boolean inQuote = false; /** * Retrieve the next valid token. INVALID tokens will be ignored. * * @return the next valid Token. * @throws NoSuchElementException * when no more tokens are left in document. */ public Token getNextValidToken() { Token t = null; do { t = nextRawToken(); } while (t.TYPE == TokenType.INVALID); return t; } /** * Get a list of the most frequent tokens, in order of frequency. Each token * has it's frequency field properly filled in. * * @param k * top k occurences desired. * @return Ordered list of tokens in order of frequency. Check the frequncy * fields of each token for exact number of occurrences. */ public List<Token> getMostFrequentVARS(final int k) { List<Token> f = new LinkedList<Token>(); for (Entry<Token, Integer> s : VAR_FREQUENCY.entrySet()) { Token t = s.getKey(); t.frequency = s.getValue(); f.add(t); } Comparator<Token> c = new Comparator<Token>() { @Override public int compare(Token a, Token b) { return new Integer(b.frequency).compareTo(new Integer( a.frequency)); } }; Collections.sort(f, c); if (k > 0 && f.size() > k) { List<Token> nf = new LinkedList<Token>(); for (int i = 0; i < k; i++) { nf.add(f.get(i)); } return nf; } return f; } public List<Token> getMostFrequentVARS() { return getMostFrequentVARS(-1); } /** * Get total number of INTS */ public int getTotalINTS() { return totalINTS; } /** * Get total number of FLOATS */ public int getTotalFLOATS() { return totalFLOATS; } /** * Get total number of VARS */ public int getTotalVARS() { return totalVARS; } public Token getMinINT() { return minINT; } public Token getMaxINT() { return maxINT; } public Token getMinFLOAT() { return minFLOAT; } public Token getMaxFLOAT() { return maxFLOAT; } /** * Get the amount of VARs followed immediately by INT or FLOAT. * */ public int getVARFollowOccurrences() { return varFollowOccurrences; } /** * Get a List of all the quoted content in the document, unmodified. * * @return a possibly empty list of all quotes. * @throws FileNotFoundException */ public List<String> getQuotes() throws FileNotFoundException { List<String> quotes = new LinkedList<String>(); Scanner scan = new Scanner(file); String curLine = "", rq = ""; boolean inQuote = false; while (scan.hasNextLine()) { curLine = scan.nextLine(); if (inQuote) rq += "\n"; while (!curLine.isEmpty()) { String ch = Character.toString(curLine.charAt(0)); curLine = curLine.substring(1); if (quote.equals(ch)) { if (inQuote) { quotes.add(rq); rq = ""; } inQuote = !inQuote; } else if (inQuote) { rq += ch; } } } return quotes; } /** * General document diagnostic events. * * @param token */ private void runDiagnostic(Token token) { try { if (token.TYPE == TokenType.INVALID) { inQuote = !inQuote; return; } // Update most popular var if (token.TYPE == TokenType.VAR) { if (!VAR_FREQUENCY.containsKey(token)) { VAR_FREQUENCY.put(token, 1); } else { VAR_FREQUENCY.put(token, VAR_FREQUENCY.get(token) + 1); } } // Keep track of how many of each type. switch (token.TYPE) { case INT: totalINTS++; if (minINT == null || maxINT == null) { maxINT = minINT = token; } else { if (new BigInteger(token.TOKEN).compareTo(new BigInteger( minINT.TOKEN)) < 0) { minINT = token; } if (new BigInteger(token.TOKEN).compareTo(new BigInteger( maxINT.TOKEN)) > 0) { maxINT = token; } } break; case FLOAT: totalFLOATS++; if (minFLOAT == null || maxFLOAT == null) { maxFLOAT = minFLOAT = token; } else { if (new BigDecimal(token.TOKEN).compareTo(new BigDecimal( minFLOAT.TOKEN)) < 0) { minFLOAT = token; } if (new BigDecimal(token.TOKEN).compareTo(new BigDecimal( maxFLOAT.TOKEN)) > 0) { maxFLOAT = token; } } break; case VAR: totalVARS++; break; case INVALID: default: break; } /* * Record occurrences of VARS followed by INTs of FLOATs. */ if (prevToken.TYPE == TokenType.VAR && (token.TYPE == TokenType.FLOAT || token.TYPE == TokenType.INT)) { varFollowOccurrences++; } // Finally, register last used token. prevToken = token; } catch (NumberFormatException n) { // The number is too large for ints System.err .println("\nEncountered INT/FLOAT that was too large for processing: " + token.TOKEN); } } /** * Attempt to parse the type of the given token. * * @param token * the string token. Should have no spaces! * @return the type of the token. Can be INT,VAR, FLOAT, or INVALID. */ public static TokenType getType(final String token) { if (isFLOAT(token)) return TokenType.FLOAT; if (isINT(token)) return TokenType.INT; if (isVAR(token)) return TokenType.VAR; return TokenType.INVALID; } /** * VAR : Begins with a lowercase (a-z) or uppercase letter (A-Z) and is * followed by one or more of the same or by a digit (0-9) or a hyphen (-) * * @param token * a string. * @return true if string is a */ public static boolean isVAR(final String token) { // Check for empty or 1-size strings. if (token.length() < 2) return false; // Verify first char is azAZ String fChar = Character.toString(token.charAt(0)); if (!(az.contains(fChar) || AZ.contains(fChar))) { return false; } // verify rest of chars are azAZ09- String rChars = token.substring(1); for (int i = 0; i < rChars.length(); i++) { fChar = Character.toString(rChars.charAt(i)); if (!(az.contains(fChar) || AZ.contains(fChar) || digits.contains(fChar) || hyphen.contains(fChar))) { return false; } } // Passes all tests! return true; } /** * INT : Starts with a digit (0-9) followed by one or more of the same (0-9) * * @param token * @return */ public static boolean isINT(final String token) { // Check for empty or 1-size strings. if (token.length() < 2) return false; // verify all chars are 0-9. for (int i = 0; i < token.length(); i++) { if (!digits.contains(Character.toString(token.charAt(i)))) { return false; } } // Passes all tests! return true; } /** * FLOAT: Starts with a digit (0-9) followed by zero or more of (0-9) * followed by a dot (.) followed by zero or more of (0-9) * * @param token * @return */ public static boolean isFLOAT(final String token) { int indexOfDot = token.indexOf(dot); // Check for empty or 1-size strings. if (token.length() < 2 || indexOfDot <= 0) return false; // Check left side String lChars = token.substring(0, indexOfDot); // verify all chars are 0-9. for (int i = 0; i < lChars.length(); i++) { if (!digits.contains(Character.toString(lChars.charAt(i)))) { return false; } } // if right side exists, check it if (indexOfDot != token.length() - 1) { lChars = token.substring(indexOfDot + 1); // verify all chars are 0-9. for (int i = 0; i < lChars.length(); i++) { if (!digits.contains(Character.toString(lChars.charAt(i)))) { return false; } } } // Passes all tests! return true; } /** * Fully tokenize the entire document at once. */ public void tokenizeAll() { try { while (true) { getNextValidToken(); } } catch (NoSuchElementException done) { // Fully Tokenized } } }