Main.java example

Explorer
topic-modeling-master
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Random;
import java.util.regex.Pattern;


/**
 * Command-line preprocessor that mirrors a tree of Java sources as files of
 * filtered tokens.
 *
 * <p>Starting from {@code <base>/data}, every {@code .java} file is tokenized
 * with a {@link StreamTokenizer} (comments discarded), each word token is
 * filtered through {@link #categorize(String)}, and the surviving tokens are
 * written, space separated, to the same relative path under
 * {@code <base>/mallet-data}.
 */
public class Main {

	/** Matches strings made up solely of digits; the empty string matches too. */
	private static final Pattern DIGITS_ONLY = Pattern.compile("\\d*");

	/** Token splitter used by {@link #categorize(String)}; assigned by {@link #init(String)}. */
	static KeywordExtractor kwe;

	/**
	 * Tokens to discard, keyed by token text; populated by {@link #init(String)}.
	 * Every value is 0 and is not read anywhere in this class.
	 */
	static HashMap<String, Long> javaKeys;

	/**
	 * Initializes the shared state: obtains the {@code KeywordExtractor} instance
	 * and loads the discard list from a text file, one entry per line (each line
	 * is trimmed before being stored).
	 *
	 * <p>Loading is best-effort: if the file cannot be read the stack trace is
	 * printed and processing continues with whatever entries were loaded so far.
	 *
	 * @param s path of the keyword file
	 */
	public static void init(String s)
	{
		kwe = KeywordExtractor.getInstance();
		javaKeys = new HashMap<String, Long>();

		try {
			BufferedReader br = new BufferedReader(new FileReader(new File(s)));
			try {
				String key;
				while ((key = br.readLine()) != null) {
					javaKeys.put(key.trim(), Long.valueOf(0L));
				}
			} finally {
				// Release the file handle even when readLine() fails
				br.close();
			}
		} catch (Exception e) {
			// Deliberately non-fatal: a missing keyword file only disables filtering
			e.printStackTrace();
		}
	}

	/**
	 * Decides whether a token should be retained.
	 *
	 * <p>The token is passed to {@code kwe.processCode}, which returns a list of
	 * parts (presumably the pieces of a compound identifier; see
	 * {@code KeywordExtractor} to confirm). The token is retained as soon as one
	 * part is not listed in {@link #javaKeys}, contains no {@code '.'} and is not
	 * made up solely of digits.
	 *
	 * @param s the token to examine; {@code null} is never retained
	 * @return {@code true} if the token should be kept, {@code false} if it
	 *         should be dropped
	 */
	static boolean categorize(String s){
		if (s == null)
			return false;

		// Split current token, if need be
		ArrayList<?> parts = kwe.processCode(s);

		// For each split part, check if it is a Java keyword, etc.
		for (Object element : parts) {
			String part = (String) element;
			if (part == null || javaKeys.containsKey(part) || part.indexOf('.') != -1)
				continue;
			if (!DIGITS_ONLY.matcher(part).matches())
				return true;
		}
		return false;
	}

	/**
	 * Recurses into a source directory, tokenizes every {@code .java} file found
	 * (comments removed, tokens filtered by {@link #categorize(String)}) and
	 * writes the result to the same relative location under the mirror
	 * directory. Files that yield no tokens are not written, and mirror
	 * directories are only created when there is something to write into them.
	 *
	 * @param baseDir   directory to scan
	 * @param mirrorDir directory that receives the tokenized copies
	 * @throws IOException if {@code baseDir} cannot be listed, or if reading a
	 *         source file or writing a mirrored file fails
	 * @throws InterruptedException never thrown by this implementation; kept so
	 *         existing callers continue to compile
	 */
	public static void recurse(String baseDir, String mirrorDir) throws IOException, InterruptedException
	{
		String[] files = new File(baseDir).list();
		if (files == null) {
			// File.list() returns null when the path is not a directory or cannot be read
			throw new IOException("Cannot list directory: " + baseDir);
		}

		for (String file : files) {
			String sourcePath = baseDir + "/" + file;

			// If the entry is a subdirectory, recurse
			if (new File(sourcePath).isDirectory()) {
				recurse(sourcePath, mirrorDir + "/" + file);
				continue;
			}

			// Only .java files are tokenized
			if (!file.endsWith(".java"))
				continue;

			String content = tokenizeFile(sourcePath);
			if (content.length() != 0)
				writeMirror(mirrorDir, file, content);
		}
	}

	/** Tokenizes one source file and returns the retained tokens, each followed by a space. */
	private static String tokenizeFile(String path) throws IOException
	{
		// Buffered because StreamTokenizer pulls characters one at a time
		BufferedReader rd = new BufferedReader(new FileReader(path));
		try {
			StreamTokenizer st = new StreamTokenizer(rd);

			// Prepare the tokenizer for Java-style tokenizing rules
			st.parseNumbers();
			st.wordChars('_', '_');
			st.eolIsSignificant(true);

			// Discard comments
			st.slashSlashComments(true);
			st.slashStarComments(true);

			StringBuilder content = new StringBuilder();
			String previous = "";
			int token = st.nextToken();
			while (token != StreamTokenizer.TT_EOF) {
				switch (token) {

				case StreamTokenizer.TT_WORD:
					if ("package".equals(previous) || "import".equals(previous)) {
						// A dotted name arrives as a single word because '.' is a
						// number constituent once parseNumbers() is on; split it
						// and filter each segment separately.
						// NOTE(review): these segments are written without
						// lower-casing, unlike ordinary tokens below - confirm
						// whether that difference is intended.
						String[] fields = st.sval.split("\\.");
						for (String field : fields) {
							previous = field;
							if (categorize(field))
								content.append(field).append(' ');
						}
					} else {
						previous = st.sval;
						// Keep the word unless it is a stopword, Java keyword, etc.
						// Locale.ROOT keeps the lower-casing independent of the
						// platform locale.
						if (categorize(st.sval))
							content.append(st.sval.toLowerCase(Locale.ROOT)).append(' ');
					}
					break;

				case StreamTokenizer.TT_NUMBER:
					// A hexadecimal literal such as 0x1F arrives as the number 0
					// followed by the word "x1F"; swallow that word. Anything
					// else (including EOF) is pushed back for the main loop.
					// Only a lower-case 'x' prefix is recognised.
					int next = st.nextToken();
					if (!(next == StreamTokenizer.TT_WORD && st.sval.startsWith("x")))
						st.pushBack();
					break;

				default:
					// Ignore every other case (EOL, quoted strings, punctuation)
					break;
				}
				token = st.nextToken();
			}
			return content.toString();
		} finally {
			rd.close();
		}
	}

	/** Writes {@code content} to {@code mirrorDir/file}, creating the directory if it is missing. */
	private static void writeMirror(String mirrorDir, String file, String content) throws IOException
	{
		File newDir = new File(mirrorDir);
		if (!newDir.exists())
			newDir.mkdirs();

		FileWriter wt = new FileWriter(mirrorDir + "/" + file);
		try {
			wt.write(content);
		} finally {
			wt.close();
		}
	}

	/**
	 * Entry point. Loads the discard list from the file {@code keywords} in the
	 * working directory, then mirrors {@code argv[0]/data} into
	 * {@code argv[0]/mallet-data}.
	 *
	 * @param argv {@code argv[0]} is the base directory where the data directory sits
	 * @throws IOException if the data directory cannot be processed
	 * @throws InterruptedException declared by {@link #recurse(String, String)}
	 */
	public static void main(String[] argv) throws IOException, InterruptedException
	{
		if (argv.length < 1) {
			System.err.println("Usage: java Main <base-directory>");
			System.exit(1);
		}

		init("keywords");
		// argv[0] is the base directory where the data directory sits.
		// Recurse through the directories; whenever we find a .java file,
		// we tokenize it and write it down in the corresponding mirrored directory.
		String dataDir = argv[0] + "/data";
		String mirrorDir = argv[0] + "/mallet-data";

		// A sample snapshot of the data directory is as follows
		// ├── thrift
		// │   ├── thrift-0.6.0
		// │   ├── thrift-0.7.0
		// │   └── thrift-0.8.0
		// ├── tomcat
		// │   ├── apache-tomcat-5.5.12
		// │   ├── apache-tomcat-6.0.0
		// │   ├── apache-tomcat-7.0.10
		// │   ├── jakarta-tomcat-3.3.1
		// │   ├── jakarta-tomcat-4.1.27
		// │   └── jakarta-tomcat-5.0.28
		// └── xerces
		//     ├── xerces-2_10_0
		//     ├── xerces-2_11_0
		//     └── xerces-2_9_1

		// The individual directories contain source code obtained from the respective
		// websites

		// Mirror directory structure while retaining only tokenized .java source files
		recurse(dataDir, mirrorDir);
	}
}