package edu.unc.ils.mrc.hive.util;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.List;
import com.aliasi.io.FileExtensionFilter;
import edu.unc.ils.mrc.hive.api.SKOSConcept;
import edu.unc.ils.mrc.hive.api.SKOSSearcher;
import edu.unc.ils.mrc.hive.api.SKOSServer;
import edu.unc.ils.mrc.hive.api.SKOSTagger;
import edu.unc.ils.mrc.hive.api.impl.elmo.SKOSServerImpl;
/**
* Simple command line tagger. Given a directory of PDF files, generates text files
* with keyphrases from the specified vocabulary.
*/
public class BatchTagger
{
static final int NUM_TERMS = 20;
public static void main(String[] args)
{
if (args.length != 3) {
System.err.println("Usage: java " + BatchTagger.class.getName() + "[path to hive.properties] [path to directory] [vocabulary]");
return;
}
// Path to hive.properties
String confPath = args[0];
// Path to directory containing PDF files
String inputPath = args[1];
// Vocabulary name
String vocabulary = args[2];
// Algorithm for indexing
String algorithm = "maui";
File inputDir = new File(inputPath);
List<String> vocabularies = new ArrayList<String>();
vocabularies.add(vocabulary);
SKOSServer server = new SKOSServerImpl(confPath);
SKOSTagger tagger = server.getSKOSTagger(algorithm);
SKOSSearcher searcher = server.getSKOSSearcher();
File[] files = inputDir.listFiles(new FileExtensionFilter("pdf"));
for (File file: files) {
try
{
String pdfName = file.getAbsolutePath();
TextManager tm = new TextManager();
String text = tm.getPlainText(new FileInputStream(file));
String textFileName = pdfName.substring(0, pdfName.lastIndexOf('.')) + ".txt";
String keyFileName = pdfName.substring(0, pdfName.lastIndexOf('.')) + ".key";
FileWriter keyFileWriter = new FileWriter(keyFileName);
List<SKOSConcept> concepts = tagger.getTags(file.getAbsolutePath(), vocabularies, searcher, NUM_TERMS, 2);
for (SKOSConcept concept : concepts) {
keyFileWriter.write(concept.getPrefLabel() + "\r\n");
}
keyFileWriter.close();
FileWriter textFileWriter = new FileWriter(textFileName);
textFileWriter.write(text);
textFileWriter.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
}