import java.io.File;
import java.io.FileNotFoundException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.NoSuchElementException;
import java.util.Scanner;
/**
 * Consumptive document scanner. Capable of basic analysis. You may tokenize
 * everything at startup using {@link #tokenizeAll()}, or you may request
 * single, consumed tokens one at a time using {@link #getNextValidToken()}.
 * The diagnostic getters only reflect the tokens consumed so far.
 *
 * @author Eric Fruchter
 *
 */
public class TokenScanner {
    /*
     * LEXICONS
     */
    final static String az = "abcdefghijklmnopqrstuvwxyz";
    // Spelled out instead of az.toUpperCase(): upper-casing depends on the
    // default locale (e.g. Turkish maps 'i' to U+0130), which would silently
    // drop 'I' from the lexicon.
    final static String AZ = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    final static String digits = "0123456789";
    final static String hyphen = "-";
    final static String space = " ";
    final static String dot = ".";
    final static String punctuation = dot + ",!?";
    final static String quote = "\"";

    /**
     * The file being scanned.
     */
    private final File file;
    /**
     * The File reader for the document.
     */
    private final Scanner fileScanner;
    /**
     * Unconsumed remainder of the line most recently read from the file.
     * Always stored trimmed.
     */
    private String latestLine = "";

    /*
     * Diagnostic info.
     */
    /** Last valid token seen by the diagnostics; starts as an INVALID sentinel. */
    private Token prevToken = new Token("INVALID", TokenType.INVALID);
    /**
     * Occurrence count per VAR token.
     * NOTE(review): keyed by Token, so counting relies on Token's
     * equals/hashCode, which are not visible here - confirm they compare by
     * token text.
     */
    private final HashMap<Token, Integer> varFrequency = new HashMap<Token, Integer>();
    private int totalVARS = 0, totalINTS = 0, totalFLOATS = 0;
    private Token minINT, maxINT, minFLOAT, maxFLOAT;
    private int varFollowOccurrences = 0;

    /**
     * Load a document into the scanner.
     *
     * @param f
     *            the file.
     * @throws FileNotFoundException
     *             if the file cannot be opened for reading.
     */
    public TokenScanner(final File f) throws FileNotFoundException {
        file = f;
        fileScanner = new Scanner(f);
    }

    /**
     * Release the file handle held by this scanner. Once closed, requesting
     * further tokens fails with an IllegalStateException raised by the
     * underlying Scanner. Already collected diagnostics remain readable.
     */
    public void close() {
        fileScanner.close();
    }

    /**
     * Consume a raw string token and return it.
     *
     * @return a raw token, possibly of type INVALID.
     * @throws NoSuchElementException
     *             when the document has no more content.
     */
    private Token nextRawToken() throws NoSuchElementException {
        // Refill the buffer, skipping blank lines. Scanner.nextLine() throws
        // NoSuchElementException at end of input, which ends the scan.
        // Trimming here keeps leading whitespace from being emitted as a
        // spurious one-character INVALID token.
        while (latestLine.isEmpty()) {
            latestLine = fileScanner.nextLine().trim();
        }
        // Index of the last char of the leading space-delimited segment.
        int segmentEnd = 0;
        while (segmentEnd + 1 < latestLine.length()
                && latestLine.charAt(segmentEnd + 1) != ' ') {
            segmentEnd++;
        }
        /*
         * Determine the longest valid prefix of the segment by starting from
         * the end and working backwards. If no prefix is valid, a single
         * character is consumed as an INVALID token so the scan always
         * advances.
         */
        int tokenEnd = 0;
        TokenType tokenType = TokenType.INVALID;
        for (int i = segmentEnd; i >= 0; i--) {
            TokenType candidate = getType(latestLine.substring(0, i + 1));
            if (candidate != TokenType.INVALID) {
                tokenEnd = i;
                tokenType = candidate;
                break;
            }
        }
        Token t = new Token(latestLine.substring(0, tokenEnd + 1), tokenType);
        runDiagnostic(t);
        // Consume the token from the document
        latestLine = latestLine.substring(tokenEnd + 1).trim();
        return t;
    }

    /**
     * Retrieve the next valid token. INVALID tokens will be ignored.
     *
     * @return the next valid Token.
     * @throws NoSuchElementException
     *             when no more tokens are left in document.
     */
    public Token getNextValidToken() {
        Token t;
        do {
            t = nextRawToken();
        } while (t.TYPE == TokenType.INVALID);
        return t;
    }

    /**
     * Get a list of the most frequent VAR tokens, in descending order of
     * frequency. Each returned token has its frequency field filled in.
     *
     * @param k
     *            top k occurrences desired. When k is zero or negative, or
     *            at least as large as the number of distinct VARs, every VAR
     *            is returned.
     * @return list of tokens ordered from most to least frequent. Check the
     *         frequency field of each token for the exact number of
     *         occurrences.
     */
    public List<Token> getMostFrequentVARS(final int k) {
        List<Token> ranked = new LinkedList<Token>();
        for (Entry<Token, Integer> entry : varFrequency.entrySet()) {
            Token t = entry.getKey();
            t.frequency = entry.getValue();
            ranked.add(t);
        }
        // Arguments are swapped (b against a) to sort highest frequency first.
        Collections.sort(ranked, new Comparator<Token>() {
            @Override
            public int compare(Token a, Token b) {
                return Integer.valueOf(b.frequency).compareTo(a.frequency);
            }
        });
        if (k > 0 && ranked.size() > k) {
            // Copy the view so the result is independent of the full list.
            return new LinkedList<Token>(ranked.subList(0, k));
        }
        return ranked;
    }

    /**
     * Get every VAR token seen so far, in descending order of frequency.
     *
     * @return list of all VAR tokens, most frequent first.
     */
    public List<Token> getMostFrequentVARS() {
        return getMostFrequentVARS(-1);
    }

    /**
     * Get total number of INTS
     */
    public int getTotalINTS() {
        return totalINTS;
    }

    /**
     * Get total number of FLOATS
     */
    public int getTotalFLOATS() {
        return totalFLOATS;
    }

    /**
     * Get total number of VARS
     */
    public int getTotalVARS() {
        return totalVARS;
    }

    /**
     * @return the smallest INT token consumed so far, or null if none.
     */
    public Token getMinINT() {
        return minINT;
    }

    /**
     * @return the largest INT token consumed so far, or null if none.
     */
    public Token getMaxINT() {
        return maxINT;
    }

    /**
     * @return the smallest FLOAT token consumed so far, or null if none.
     */
    public Token getMinFLOAT() {
        return minFLOAT;
    }

    /**
     * @return the largest FLOAT token consumed so far, or null if none.
     */
    public Token getMaxFLOAT() {
        return maxFLOAT;
    }

    /**
     * Get the amount of VARs followed by an INT or FLOAT. INVALID tokens in
     * between are skipped by the diagnostics and do not break the pairing.
     */
    public int getVARFollowOccurrences() {
        return varFollowOccurrences;
    }

    /**
     * Get a List of all the quoted content in the document, unmodified. The
     * file is re-read from the start with a separate reader, so this does
     * not disturb token consumption. A quote that spans several lines has
     * its lines joined with "\n"; a quote still open at end of file is
     * dropped.
     *
     * @return a possibly empty list of all quotes.
     * @throws FileNotFoundException
     *             if the file can no longer be opened.
     */
    public List<String> getQuotes() throws FileNotFoundException {
        List<String> quotes = new LinkedList<String>();
        final char quoteChar = quote.charAt(0);
        StringBuilder current = new StringBuilder();
        boolean insideQuote = false;
        Scanner scan = new Scanner(file);
        try {
            while (scan.hasNextLine()) {
                String line = scan.nextLine();
                if (insideQuote) {
                    // The quote continues from the previous line.
                    current.append("\n");
                }
                for (int i = 0; i < line.length(); i++) {
                    char c = line.charAt(i);
                    if (c == quoteChar) {
                        if (insideQuote) {
                            // Closing quote: emit what was collected.
                            quotes.add(current.toString());
                            current.setLength(0);
                        }
                        insideQuote = !insideQuote;
                    } else if (insideQuote) {
                        current.append(c);
                    }
                }
            }
        } finally {
            // Always release the secondary file handle.
            scan.close();
        }
        return quotes;
    }

    /**
     * General document diagnostic events. Updates the per-type totals, the
     * min/max INT and FLOAT, the VAR frequency table and the VAR-followed-by-
     * number counter. INVALID tokens are ignored entirely.
     *
     * @param token
     *            the token that was just consumed.
     */
    private void runDiagnostic(Token token) {
        if (token.TYPE == TokenType.INVALID) {
            return;
        }
        try {
            // Keep track of how many of each type.
            switch (token.TYPE) {
            case INT:
                totalINTS++;
                if (minINT == null || maxINT == null) {
                    minINT = token;
                    maxINT = token;
                } else {
                    BigInteger value = new BigInteger(token.TOKEN);
                    if (value.compareTo(new BigInteger(minINT.TOKEN)) < 0) {
                        minINT = token;
                    }
                    if (value.compareTo(new BigInteger(maxINT.TOKEN)) > 0) {
                        maxINT = token;
                    }
                }
                break;
            case FLOAT:
                totalFLOATS++;
                if (minFLOAT == null || maxFLOAT == null) {
                    minFLOAT = token;
                    maxFLOAT = token;
                } else {
                    BigDecimal value = new BigDecimal(token.TOKEN);
                    if (value.compareTo(new BigDecimal(minFLOAT.TOKEN)) < 0) {
                        minFLOAT = token;
                    }
                    if (value.compareTo(new BigDecimal(maxFLOAT.TOKEN)) > 0) {
                        maxFLOAT = token;
                    }
                }
                break;
            case VAR:
                totalVARS++;
                // Update the occurrence count of this var.
                Integer seen = varFrequency.get(token);
                varFrequency.put(token, seen == null ? 1 : seen + 1);
                break;
            case INVALID:
            default:
                break;
            }
            /*
             * Record occurrences of VARS followed by INTs or FLOATs.
             */
            if (prevToken.TYPE == TokenType.VAR
                    && (token.TYPE == TokenType.FLOAT || token.TYPE == TokenType.INT)) {
                varFollowOccurrences++;
            }
            // Finally, register last used token.
            prevToken = token;
        } catch (NumberFormatException n) {
            // Defensive only: BigInteger/BigDecimal reject malformed text,
            // not large values, and tokens typed INT/FLOAT by getType are
            // expected to parse. On failure the follow counter and
            // prevToken are left untouched for this token.
            System.err
                    .println("\nEncountered INT/FLOAT that was too large for processing: "
                            + token.TOKEN);
        }
    }

    /**
     * Attempt to parse the type of the given token. FLOAT is tried first,
     * then INT, then VAR.
     *
     * @param token
     *            the string token. Should have no spaces!
     * @return the type of the token. Can be INT, VAR, FLOAT, or INVALID
     *         (also for null).
     */
    public static TokenType getType(final String token) {
        if (isFLOAT(token)) {
            return TokenType.FLOAT;
        }
        if (isINT(token)) {
            return TokenType.INT;
        }
        if (isVAR(token)) {
            return TokenType.VAR;
        }
        return TokenType.INVALID;
    }

    /**
     * VAR : Begins with a lowercase (a-z) or uppercase letter (A-Z) and is
     * followed by one or more of the same or by a digit (0-9) or a hyphen (-)
     *
     * @param token
     *            a string.
     * @return true if string is a VAR; false otherwise, including for null
     *         and for strings shorter than two characters.
     */
    public static boolean isVAR(final String token) {
        if (token == null || token.length() < 2) {
            return false;
        }
        // Verify first char is azAZ
        if (!isLetter(token.charAt(0))) {
            return false;
        }
        // verify rest of chars are azAZ09-
        for (int i = 1; i < token.length(); i++) {
            char c = token.charAt(i);
            if (!(isLetter(c) || isDigit(c) || hyphen.indexOf(c) >= 0)) {
                return false;
            }
        }
        return true;
    }

    /**
     * INT : Starts with a digit (0-9) followed by one or more of the same (0-9)
     *
     * @param token
     *            a string.
     * @return true if string is an INT; false otherwise, including for null
     *         and for strings shorter than two characters.
     */
    public static boolean isINT(final String token) {
        if (token == null || token.length() < 2) {
            return false;
        }
        return allDigits(token, 0, token.length());
    }

    /**
     * FLOAT: Starts with a digit (0-9) followed by zero or more of (0-9)
     * followed by a dot (.) followed by zero or more of (0-9)
     *
     * @param token
     *            a string.
     * @return true if string is a FLOAT; false otherwise, including for
     *         null.
     */
    public static boolean isFLOAT(final String token) {
        if (token == null || token.length() < 2) {
            return false;
        }
        // The first dot must exist and must not be the first character.
        int dotAt = token.indexOf(dot);
        if (dotAt <= 0) {
            return false;
        }
        // Both sides must be digits only; the right side may be empty. A
        // second dot lands in the right side and fails the digit check.
        return allDigits(token, 0, dotAt)
                && allDigits(token, dotAt + 1, token.length());
    }

    /**
     * Fully tokenize the entire document at once.
     */
    public void tokenizeAll() {
        try {
            while (true) {
                getNextValidToken();
            }
        } catch (NoSuchElementException done) {
            // Fully Tokenized: end of input is signalled by this exception.
        }
    }

    /** True if c is one of a-z or A-Z. */
    private static boolean isLetter(final char c) {
        return az.indexOf(c) >= 0 || AZ.indexOf(c) >= 0;
    }

    /** True if c is one of 0-9. */
    private static boolean isDigit(final char c) {
        return digits.indexOf(c) >= 0;
    }

    /** True if every char of s in [from, to) is a digit; true for an empty range. */
    private static boolean allDigits(final String s, final int from, final int to) {
        for (int i = from; i < to; i++) {
            if (!isDigit(s.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}