package lda.wikievidence.modelcreation;

import java.nio.charset.StandardCharsets;

/**
 * Generates Scala configuration scripts for the Stanford Topic Modeling
 * Toolbox (TMT). Each factory method returns the script as a byte array
 * that callers can write to a .scala file and run with the TMT jar.
 */
public class ConfigCreation {

    /** Appends one script statement followed by a platform line separator. */
    private static void appendLine(StringBuilder buffer, String line) {
        buffer.append(line).append(System.lineSeparator());
    }

    /**
     * Creates a TMT script that trains a standard PLDA model.
     *
     * @param datafilePath    CSV input file; column 1 holds the document ID,
     *                        column 2 the labels, column 4 the document text
     * @param modelOutputPath path the trained model is written to
     */
    public static byte[] createStandardPLDAConfig(String datafilePath, String modelOutputPath) {
        StringBuilder buffer = new StringBuilder();
        // ScalaNLP / TMT imports required by every generated script.
        appendLine(buffer, "import scalanlp.io._;");
        appendLine(buffer, "import scalanlp.stage._;");
        appendLine(buffer, "import scalanlp.stage.text._;");
        appendLine(buffer, "import scalanlp.text.tokenize._;");
        appendLine(buffer, "import scalanlp.pipes.Pipes.global._;");
        appendLine(buffer, "import edu.stanford.nlp.tmt.stage._;");
        appendLine(buffer, "import edu.stanford.nlp.tmt.model.lda._;");
        appendLine(buffer, "import edu.stanford.nlp.tmt.model.llda._;");
        appendLine(buffer, "import edu.stanford.nlp.tmt.model.plda._;");
        // Data source: the CSV file, keyed by the document ID in column 1.
        appendLine(buffer, "val source = CSVFile(\"" + datafilePath + "\") ~> IDColumn(1);");
        // Tokenizer: lower-cased words and numbers of at least three characters.
        appendLine(buffer, "val tokenizer = {");
        appendLine(buffer, "SimpleEnglishTokenizer() ~>");
        appendLine(buffer, "CaseFolder() ~>");
        appendLine(buffer, "WordsAndNumbersOnlyFilter() ~>");
        appendLine(buffer, "MinimumLengthFilter(3)");
        appendLine(buffer, "}");
        // Document text pipeline with frequency-based term filtering.
        appendLine(buffer, "val text = {");
        appendLine(buffer, "source ~>");
        appendLine(buffer, "Column(4) ~>");
        appendLine(buffer, "TokenizeWith(tokenizer) ~>");
        appendLine(buffer, "TermCounter() ~>");
        appendLine(buffer, "TermMinimumDocumentCountFilter(4) ~>");
        appendLine(buffer, "TermDynamicStopListFilter(30) ~>");
        appendLine(buffer, "DocumentMinimumLengthFilter(5)");
        appendLine(buffer, "}");
        // Label pipeline: whitespace-separated labels from column 2.
        appendLine(buffer, "val labels = {");
        appendLine(buffer, "source ~>");
        appendLine(buffer, "Column(2) ~>");
        appendLine(buffer, "TokenizeWith(WhitespaceTokenizer()) ~>");
        appendLine(buffer, "TermCounter() ~>");
        appendLine(buffer, "TermMinimumDocumentCountFilter(10)");
        appendLine(buffer, "}");
        // Model definition and training: one background topic, one topic per label.
        appendLine(buffer, "val dataset = LabeledLDADataset(text, labels);");
        appendLine(buffer, "val numBackgroundTopics = 1;");
        appendLine(buffer, "val numTopicsPerLabel = SharedKTopicsPerLabel(1);");
        appendLine(buffer, "val modelParams = PLDAModelParams(dataset, numBackgroundTopics, numTopicsPerLabel, termSmoothing = 0.01, topicSmoothing = 0.01);");
        appendLine(buffer, "val modelPath = file(\"" + modelOutputPath + "\");");
        appendLine(buffer, "TrainCVB0PLDA(modelParams, dataset, output = modelPath, maxIterations = 50);");
        // Explicit charset so the bytes do not depend on the platform default.
        return buffer.toString().getBytes(StandardCharsets.UTF_8);
    }

    /**
     * Creates a TMT script that trains the PLDA model used for evidence
     * extraction: text from column 3, a file-based stop list, two background
     * topics, and fewer training iterations.
     *
     * @param datafilePath    CSV input file; column 1 holds the document ID,
     *                        column 2 the labels, column 3 the document text
     * @param modelOutputPath path the trained model is written to
     * @param nrTopTerms      number of top terms per topic; note that this
     *                        parameter is currently unused by the generated script
     */
    public static byte[] createEvidenceExtractionConfig(String datafilePath, String modelOutputPath, int nrTopTerms) {
        StringBuilder buffer = new StringBuilder();
        // ScalaNLP / TMT imports required by every generated script.
        appendLine(buffer, "import scalanlp.io._;");
        appendLine(buffer, "import scalanlp.stage._;");
        appendLine(buffer, "import scalanlp.stage.text._;");
        appendLine(buffer, "import scalanlp.text.tokenize._;");
        appendLine(buffer, "import scalanlp.pipes.Pipes.global._;");
        appendLine(buffer, "import edu.stanford.nlp.tmt.stage._;");
        appendLine(buffer, "import edu.stanford.nlp.tmt.model.lda._;");
        appendLine(buffer, "import edu.stanford.nlp.tmt.model.llda._;");
        appendLine(buffer, "import edu.stanford.nlp.tmt.model.plda._;");
        // Data source: the CSV file, keyed by the document ID in column 1.
        appendLine(buffer, "val source = CSVFile(\"" + datafilePath + "\") ~> IDColumn(1);");
        // Tokenizer: lower-cased words and numbers of at least three characters.
        appendLine(buffer, "val tokenizer = {");
        appendLine(buffer, "SimpleEnglishTokenizer() ~>");
        appendLine(buffer, "CaseFolder() ~>");
        appendLine(buffer, "WordsAndNumbersOnlyFilter() ~>");
        appendLine(buffer, "MinimumLengthFilter(3)");
        appendLine(buffer, "}");
        // Document text pipeline: column 3, plus a static stop list read from stopwords.txt.
        appendLine(buffer, "val text = {");
        appendLine(buffer, "source ~>");
        appendLine(buffer, "Column(3) ~>");
        appendLine(buffer, "TokenizeWith(tokenizer) ~>");
        appendLine(buffer, "TermCounter() ~>");
        appendLine(buffer, "TermMinimumDocumentCountFilter(5) ~>");
        appendLine(buffer, "TermStopListFilter(scala.io.Source.fromFile(\"stopwords.txt\").getLines().toList) ~>");
        appendLine(buffer, "TermDynamicStopListFilter(50) ~>");
        appendLine(buffer, "DocumentMinimumLengthFilter(5)");
        appendLine(buffer, "}");
        // Label pipeline: whitespace-separated labels from column 2.
        appendLine(buffer, "val labels = {");
        appendLine(buffer, "source ~>");
        appendLine(buffer, "Column(2) ~>");
        appendLine(buffer, "TokenizeWith(WhitespaceTokenizer()) ~>");
        appendLine(buffer, "TermCounter() ~>");
        appendLine(buffer, "TermMinimumDocumentCountFilter(1)");
        appendLine(buffer, "}");
        // Model definition and training: two background topics, one topic per label.
        appendLine(buffer, "val dataset = LabeledLDADataset(text, labels);");
        appendLine(buffer, "val numBackgroundTopics = 2;");
        appendLine(buffer, "val numTopicsPerLabel = SharedKTopicsPerLabel(1);");
        appendLine(buffer, "val modelParams = PLDAModelParams(dataset, numBackgroundTopics, numTopicsPerLabel, termSmoothing = 0.01, topicSmoothing = 0.01);");
        appendLine(buffer, "val modelPath = file(\"" + modelOutputPath + "\");");
        buffer.append("TrainCVB0PLDA(modelParams, dataset, output = modelPath, maxIterations = 25);");
        // Explicit charset so the bytes do not depend on the platform default.
        return buffer.toString().getBytes(StandardCharsets.UTF_8);
    }
}