import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.StreamTokenizer; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.Random; public class Main { static KeywordExtractor kwe; static HashMap<String, Long> javaKeys; public static void init(String s) { kwe = KeywordExtractor.getInstance(); javaKeys = new HashMap<String, Long>(); try{ File f = new File(s); BufferedReader br = new BufferedReader(new FileReader(f)); String key; while((key=br.readLine())!=null){ javaKeys.put(key.trim(),new Long(0)); } br.close(); } catch(Exception e) { e.printStackTrace(); } } // This function returns false if the token is a Java keyword or stopword // Else it returns true so that the token is retained static boolean categorize(String s){ // Split current token, if need be ArrayList al = kwe.processCode(s); Iterator it = al.iterator(); // For each split part, check if it is a java keyword, etc. while(it.hasNext()){ String ss = (String) it.next(); if(s!=null && !javaKeys.containsKey(ss) && ss.indexOf('.')==-1){ if (!ss.matches("\\d*")) return true; } } return false; } // This function recurses into the source directory containing .java source files // It tokenizes each .java file, removes comments, public static void recurse(String baseDir, String mirrorDir) throws IOException, InterruptedException { // Initialize a stream tokenizers File dir = new File(baseDir); String[] files = dir.list(); for (String file : files) { // If the file is a subdirectory, recurse if (new File(baseDir + "/" + file).isDirectory()) recurse(baseDir + "/" + file, mirrorDir + "/" + file); else { // Check if file ends in .java and if it does, tokenize it if (file.endsWith(".java") == false) continue; // Initialize a stream tokenizer FileReader rd = new FileReader(baseDir + "/" + file); StreamTokenizer st = new StreamTokenizer(rd); // Prepare the tokenizer for Java-style tokenizing rules st.parseNumbers(); st.wordChars('_', '_'); st.eolIsSignificant(true); // Discard comments st.slashSlashComments(true); st.slashStarComments(true); // Parse file int token = st.nextToken(); String content = ""; String previous = ""; while (token != StreamTokenizer.TT_EOF) { switch (token) { case StreamTokenizer.TT_WORD: // Check if it is a package name from package import statement if (previous.compareTo("package") == 0 || previous.compareTo("import") == 0) { String[] fields = st.sval.split("\\."); for (int i=0; i<fields.length; i++) { previous = fields[i]; if (categorize(fields[i])) content += fields[i] + " "; } break; } previous = st.sval; // Check if the word a stopword, java keyword, etc. // If not, append it to the content to be written back if (categorize(st.sval)) content += st.sval.toLowerCase() + " "; break; case StreamTokenizer.TT_NUMBER: // Check for numbers, decimal and hexadecimal if ((token = st.nextToken()) != StreamTokenizer.TT_EOF) { if (token == StreamTokenizer.TT_WORD && st.sval.startsWith("x")) ; else st.pushBack(); } else st.pushBack(); break; default: // Ignore every other case break; } token = st.nextToken(); } rd.close(); // Write content to the file if (content.length() != 0) { File newDir = new File(mirrorDir); if (newDir.exists() == false) newDir.mkdirs(); FileWriter wt = null; wt = new FileWriter(mirrorDir + "/" + file); wt.write(content); wt.close(); } } } } public static void main(String[] argv) throws IOException, InterruptedException { init("keywords"); // argv[0] is the base directory where data directory sits // Recurse through the directories // Whenever we find a .java file, we tokenize it and write // it down in corresponding mirrored directoy String dataDir = argv[0] + "/data"; String mirrorDir = argv[0] + "/mallet-data"; // A sample snapshot of the data directory is as following // ├── thrift // │   ├── thrift-0.6.0 // │   ├── thrift-0.7.0 // │   └── thrift-0.8.0 // ├── tomcat // │   ├── apache-tomcat-5.5.12 // │   ├── apache-tomcat-6.0.0 // │   ├── apache-tomcat-7.0.10 // │   ├── jakarta-tomcat-3.3.1 // │   ├── jakarta-tomcat-4.1.27 // │   └── jakarta-tomcat-5.0.28 // └── xerces // ├── xerces-2_10_0 // ├── xerces-2_11_0 // └── xerces-2_9_1 // The individual directories contain source code obtained from the respective // websites // Mirror directory structure while retaining only tokenized .java source files recurse(dataDir, mirrorDir); } }