package jhazm.terminal;
import edu.stanford.nlp.ling.TaggedWord;
import jhazm.*;
import jhazm.tokenizer.WordTokenizer;
import org.apache.commons.cli.*;
import org.apache.commons.io.output.StringBuilderWriter;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.maltparser.concurrent.graph.ConcurrentDependencyGraph;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
/**
 * Command-line entry point for JHazm.
 * <p>
 * Reads the input text from a file ({@code -i}) or directly from the command line
 * ({@code -t}), normalizes and word-tokenizes it, runs the requested {@link Action}
 * ({@code -a}) and writes the result to the output file ({@code -o}) encoded as UTF-8.
 * Progress and diagnostic messages are printed to standard error.
 * <p>
 * Created by majid on 12/31/14.
 */
public class Runner {

    /** Input file used when neither {@code -i} nor {@code -t} is supplied. */
    private static final String DEFAULT_INPUT_FILE = "resources/sample.txt";

    /**
     * Parses the command line, loads the input text and executes the selected action.
     * <p>
     * The JVM is terminated with status 1 when the action name is unknown, the input
     * file does not exist, or the selected action fails. When a required option is
     * missing or the command line cannot be parsed, the usage text is printed via
     * {@link #showHelp(Options)}, which terminates the JVM itself.
     *
     * @param args command-line arguments
     * @throws IOException if the input file cannot be read
     */
    public static void main(String[] args) throws IOException {
        // create the Options
        Options options = new Options();
        options.addOption("a", "action", true, "action to do. " + StringUtils.join(Action.values(), ", ") + ", are the options.");
        options.addOption("i", "input", true, "input, standard input file file.");
        options.addOption("t", "text", true, "input text");
        options.addOption("o", "output", true, "output, standard output file.");
        options.addOption("v", "verbose", true, "show output string on console. console may not support UTF-8 in some operating systems.");

        CommandLineParser parser = new BasicParser();
        Action action = null;
        Path outputPath = null;
        String inputText = null;
        boolean verbose = true;

        try {
            CommandLine line = parser.parse(options, args);
            if (!line.hasOption("a")) {
                showHelp(options);
            }
            try {
                action = Action.valueOf(line.getOptionValue("a"));
            } catch (IllegalArgumentException exp) {
                System.err.println("wrong action.");
                System.exit(1);
            }
            if (action == null) {
                showHelp(options);
            }

            // Fall back to the bundled sample only when no input source was given at all.
            String inputFile = null;
            if (!line.hasOption("i") && !line.hasOption("t")) {
                inputFile = DEFAULT_INPUT_FILE;
            }
            if (line.hasOption("i")) {
                inputFile = line.getOptionValue("i");
            }
            if ((inputFile == null && !line.hasOption("t")) || !line.hasOption("o")) {
                showHelp(options);
            }

            if (inputFile != null) {
                Path inputPath = Paths.get(inputFile);
                if (!Files.exists(inputPath)) {
                    System.err.println("file does not exists: " + inputPath.toFile().getAbsolutePath());
                    System.exit(1);
                } else {
                    System.err.println("input file: " + inputPath.toFile().getAbsolutePath());
                    inputText = new String(Files.readAllBytes(inputPath), StandardCharsets.UTF_8);
                }
            }
            // Text given with -t takes precedence over text read from a file.
            if (line.hasOption("t")) {
                inputText = line.getOptionValue("t");
            }
            if (line.hasOption("o")) {
                outputPath = Paths.get(line.getOptionValue("o"));
                System.err.println("output path is: " + outputPath.toFile().getAbsolutePath());
            }
            if (line.hasOption("v")) {
                verbose = "true".equals(line.getOptionValue("v"));
            }
        } catch (ParseException exp) {
            System.err.println(exp);
            showHelp(options);
        }

        // showHelp() exits the JVM, so both values are set whenever execution gets here.
        assert action != null;
        assert outputPath != null;

        Normalizer normalizer = new Normalizer();
        inputText = normalizer.run(inputText);
        WordTokenizer tokenizer = new WordTokenizer();
        List<String> tokens = tokenizer.tokenize(inputText);
        StringBuilder builder = new StringBuilder();
        System.err.println("working directory: " + System.getProperty("user.dir"));

        try {
            switch (action) {
                case stemming: {
                    System.err.println("stemming, text = " + inputText);
                    Stemmer stemmer = new Stemmer();
                    for (String token : tokens) {
                        String stem = stemmer.stem(token);
                        if (verbose) {
                            System.err.println(stem);
                        }
                        builder.append(stem).append(' ');
                    }
                    // Write the per-token stems; stemming the whole text as a single
                    // word would discard the work done in the loop above.
                    writeOutput(outputPath, builder.toString());
                    break;
                }
                case normalizing: {
                    System.err.println("normalizing, text = " + inputText);
                    if (verbose) {
                        System.err.println(inputText);
                    }
                    writeOutput(outputPath, inputText);
                    break;
                }
                // NOTE(review): sentenceTokenizing emits the same word tokens as
                // workTokenizing; no sentence-level tokenizer is used here.
                case workTokenizing:
                case sentenceTokenizing: {
                    System.err.println("tokenizing, text = " + inputText);
                    String tokenized = StringUtils.join(tokens, " ");
                    if (verbose) {
                        System.err.println(tokenized);
                    }
                    assert tokenized != null;
                    writeOutput(outputPath, tokenized);
                    break;
                }
                case lemmatize: {
                    System.err.println("lemmatize, text = " + inputText);
                    Lemmatizer lemmatizer = new Lemmatizer();
                    for (String token : tokens) {
                        String lemma = lemmatizer.lemmatize(token);
                        if (verbose) {
                            System.err.println(lemma);
                        }
                        builder.append(lemma).append(' ');
                    }
                    writeOutput(outputPath, builder.toString());
                    break;
                }
                case partOfSpeechTagging: {
                    System.err.println("part of speech tagging, text = " + inputText);
                    POSTagger posTagger = new POSTagger();
                    List<TaggedWord> tagged = posTagger.batchTag(tokens);
                    for (TaggedWord taggedWord : tagged) {
                        // Only the console echo depends on verbose; the file content must not.
                        if (verbose) {
                            System.err.println(taggedWord.word() + "\t" + taggedWord.tag());
                        }
                        builder.append(taggedWord.word()).append("\t").append(taggedWord.tag()).append("\r\n");
                    }
                    writeOutput(outputPath, builder.toString());
                    break;
                }
                case dependencyParsing: {
                    System.err.println("dependency parser, text = " + inputText);
                    DependencyParser dependencyParser = new DependencyParser();
                    POSTagger posTagger = new POSTagger();
                    List<TaggedWord> tagged = posTagger.batchTag(tokens);
                    ConcurrentDependencyGraph graph = dependencyParser.rawParse(tagged);
                    String output = graph.toString();
                    System.err.println(output);
                    writeOutput(outputPath, output);
                    break;
                }
                default:
                    break;
            }
        } catch (Exception e) {
            // Top-level boundary: report the failure and signal it through the exit status.
            System.err.println(e);
            System.exit(1);
        }
    }

    /**
     * Writes {@code text} to {@code outputPath} encoded as UTF-8.
     *
     * @param outputPath file to write
     * @param text       content to write
     * @throws IOException if the file cannot be written
     */
    private static void writeOutput(Path outputPath, String text) throws IOException {
        Files.write(outputPath, text.getBytes(StandardCharsets.UTF_8));
    }

    /**
     * Prints the usage text for the given options to standard error and terminates
     * the JVM with exit status 0.
     *
     * @param options the command-line options to describe
     */
    private static void showHelp(Options options) {
        HelpFormatter formatter = new HelpFormatter();
        final StringBuilder helpBuilder = new StringBuilder().append('\n');
        helpBuilder.append("Welcome to JHazm.").append('\n');
        // The formatter writes into helpBuilder through the StringBuilderWriter.
        PrintWriter pw = new PrintWriter(new StringBuilderWriter(helpBuilder));
        formatter.printHelp(pw, 80, "java -jar jhazm.jar", null,
                options, 0, 0, "Thank you", false);
        helpBuilder.append("Required options for stemmer: --i or --t, --o").append('\n');
        System.err.println(helpBuilder);
        System.exit(0);
    }
}