package de.bitocean.mm.importer;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import javax.swing.JFrame;
import org.semanpix.parser.TikaGUI;
import org.apache.tika.metadata.*;
/**
 * Command-line batch tool that extracts the plain text of every file ending in
 * {@code .pdf} found directly inside {@link #PDF_BASEPATH} and writes it to a
 * {@code .txt} file of the same base name inside {@link #TXT_BASEPATH}.
 *
 * <p>Text extraction itself is delegated to {@link TikaGUI}.
 *
 * @author kamir
 */
public class PlainTextFromPDFExtractor {

    /** Directory that is scanned (non-recursively) for input PDF files. */
    public static String PDF_BASEPATH = "/Users/" + System.getProperty("user.name") + "/Desktop/exp4/pdf";

    /** Directory that receives one {@code .txt} file per processed PDF. */
    public static String TXT_BASEPATH = "/Users/" + System.getProperty("user.name") + "/Desktop/exp4/txt";

    /** File name suffix (case-sensitive) that marks a file as input. */
    private static final String PDF_SUFFIX = ".pdf";

    /**
     * Runs the extraction over all PDFs in {@link #PDF_BASEPATH} and then
     * terminates the JVM with exit code 0.
     *
     * @param args ignored
     * @throws IllegalStateException if {@link #PDF_BASEPATH} cannot be listed
     * @throws Exception if the schema file cannot be read, or if extracting or
     *         writing any single document fails (the batch stops at that document)
     */
    public static void main(String[] args) throws Exception {
        File fPDF = new File(PDF_BASEPATH);
        System.out.println(">>> (PDF) f=" + fPDF.getAbsolutePath());

        // listFiles() returns null (not an empty array) when the path does not
        // exist, is not a directory, or cannot be read.
        File[] pdfs = fPDF.listFiles();
        if (pdfs == null) {
            throw new IllegalStateException(
                    "PDF base path is not a readable directory: " + fPDF.getAbsolutePath());
        }
        System.out.println(">>> number of files: " + pdfs.length);

        File fSCHEMA = new File("./morphline-projects/corpus1/conf/schema.xml");
        System.out.println(">>> (SCHEMA) exists:" + fSCHEMA.exists() + ": " + fSCHEMA.getAbsolutePath());
        // The schema is only needed by the field matching step, which is
        // currently disabled:
        //   PropertyFieldMatcher pfm = new PropertyFieldMatcher();
        //   pfm.process( md, fPDF, SCHEMAXML, textContent );
        // It is still read here so that a missing schema keeps failing fast.
        String SCHEMAXML = readLines(fSCHEMA);

        // Best effort: if the directory cannot be created, the first write
        // below fails with a FileNotFoundException naming the target file.
        new File(TXT_BASEPATH).mkdirs();

        int z = 1;
        for (File f : pdfs) {
            if (!f.getName().endsWith(PDF_SUFFIX)) {
                continue;
            }
            String txtFN = TXT_BASEPATH + "/" + toTextBaseName(f.getName()) + ".txt";

            // NOTE(review): getTextContent() takes no argument, so it presumably
            // returns the text of the file parsed by the preceding
            // getMetadataFromFile(f) call - keep this call order; confirm in TikaGUI.
            TikaGUI.getMetadataFromFile(f);
            String textContent = TikaGUI.getTextContent();

            System.out.println(z + ") >" + textContent.split(" ").length + " # " + txtFN);
            writeText(new File(txtFN), textContent);
            z++;
        }

        // NOTE(review): explicit exit presumably guards against non-daemon
        // (e.g. AWT) threads keeping the JVM alive - confirm before removing.
        System.exit(0);
    }

    /** Reads a text file completely, terminating every line with {@code '\n'}. */
    private static String readLines(File file) throws Exception {
        StringBuilder sb = new StringBuilder();
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
        }
        return sb.toString();
    }

    /** Strips the {@code .pdf} suffix and replaces {@code '/'} and {@code ':'} by {@code '_'}. */
    private static String toTextBaseName(String pdfFileName) {
        String base = pdfFileName.substring(0, pdfFileName.length() - PDF_SUFFIX.length());
        return base.replace('/', '_').replace(':', '_');
    }

    /** Writes {@code text} to {@code target}, closing the file even if writing fails. */
    private static void writeText(File target, String text) throws Exception {
        try (BufferedWriter out = new BufferedWriter(new FileWriter(target))) {
            out.write(text);
        }
    }
}