package org.iswc.iswc2012main;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.TreeMap;
import java.util.TreeSet;
import org.iswc.util.ToolText2Rdf;
import sw4j.util.DataSmartMap;
import sw4j.util.Sw4jException;
import sw4j.util.ToolIO;
/**
 * Accumulates the text of one paper, fed line by line through
 * {@link #processLine(String)}, into per-section buckets (title, author,
 * affiliation, abstract, keywords, content) using a small state machine.
 *
 * <p>Section boundaries are detected heuristically from the line text
 * (markers such as "abstract", "keywords:", "1 introduction", a list of known
 * author names, and a character-count fallback). Progress is traced on
 * {@code System.out}.
 *
 * <p>Instances are not synchronised; the shared {@link #setName} cache is a
 * plain static {@link TreeSet}.
 */
public class DataPaperInPdf {

    /** The section currently being collected; also used as key into {@link #data}. */
    public enum STATE{
        ready,
        title,
        author ,
        affiliation,
        _abstract,
        keyword,
        content,
    }

    /**
     * Lower-cased line prefixes that start the paper body, paired with the
     * trace marker printed when the prefix is seen. Checked in order.
     */
    private static final String[][] CONTENT_HEADINGS = {
        {"1 introduction", "---INTRODUCTION--"},
        {"1 motivation", "---MOTIVATION--"},
        {"1 sparql", "---SPARQL--"},
        {"1 background", "---Background--"},
        {"1 research", "---Research--"},
        {"1 problem", "---Problem--"},
        {"1 smart", "---Smart--"},
    };

    /** Collected text per section; {@code STATE.ready} holds the paper id. */
    TreeMap<STATE,String> data = new TreeMap<STATE,String>();

    /** Current state of the line-driven state machine. */
    public STATE state = STATE.ready;

    /** Identifier of the paper, stored under {@code STATE.ready} on the first non-blank line. */
    String id = null;

    /**
     * @param id identifier recorded for this paper in the report
     */
    public DataPaperInPdf(String id) {
        super();
        this.id = id;
    }

    /**
     * Appends a fragment to the text collected for {@code prop}, separating
     * fragments with a single space.
     *
     * @param prop section to append to
     * @param line fragment to append; surrounding whitespace is dropped
     */
    private void appendContent(STATE prop, String line){
        String fragment = line.trim();
        String content = data.get(prop);
        if (null==content){
            // Trim the first fragment too, so it is treated like every later one.
            content = fragment;
        }else{
            content += " " + fragment;
        }
        data.put(prop, content);
    }

    /**
     * Known name tokens, lazily loaded from {@code local/misc/name.txt} and
     * shared by all instances.
     */
    static TreeSet<String> setName = new TreeSet<String>();

    /**
     * Fills {@link #setName} from {@code local/misc/name.txt} if it is still
     * empty. Each line of the file is split on spaces and every token is
     * passed through {@code ToolText2Rdf.extractLocalName} before being stored.
     * A missing file leaves the set empty (and is re-checked on the next call).
     */
    private static void loadNamesIfNeeded(){
        if (setName.size()!=0){
            return;
        }
        File fName = new File("local/misc/name.txt");
        if (!fName.exists()){
            return;
        }
        for (String name: pipeFileToStringArray(fName)){
            for (String temp: name.split(" ")){
                setName.add(ToolText2Rdf.extractLocalName(temp));
            }
        }
    }

    /**
     * Tells whether a line looks like an author line, i.e. whether at least
     * one of its tokens (split on whitespace or commas) is a known name.
     * A leading "and" token is skipped.
     *
     * @param line the line to test; the caller passes the lower-cased,
     *             whitespace-normalised form
     * @return {@code true} if any token is contained in {@link #setName}
     */
    private boolean isAuthorLine(String line){
        loadNamesIfNeeded();

        String[] words = line.split("[\\s,]");
        if (words.length==0){
            return false;
        }
        int indexBase = words[0].equals("and") ? 1 : 0;
        for (int i=indexBase; i<words.length; i++){
            String name = ToolText2Rdf.extractLocalName(words[i]);
            if (setName.contains(name)){
                return true;
            }
        }
        return false;
    }

    /**
     * Reads a file into a list of lines.
     *
     * <p>Errors are reported with {@code printStackTrace()} and not
     * propagated; the lines read up to the point of failure are returned.
     *
     * @param f file to read, opened via {@code ToolIO.prepareFileInputStream}
     * @return the lines read, in order; never {@code null}
     */
    public static List<String> pipeFileToStringArray(File f){
        ArrayList<String> ret = new ArrayList<String>();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(ToolIO.prepareFileInputStream(f)));
            String line;
            while (null!=(line=reader.readLine())){
                ret.add(line);
            }
        } catch (Sw4jException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always release the underlying file handle, even after a read error.
            if (null!=reader){
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return ret;
    }

    /** Number of lines passed to {@link #processLine(String)} so far (blank ones included). */
    int lineNumber = 0;

    /** Total number of characters passed to {@link #processLine(String)} so far. */
    int cntChar = 0;

    /**
     * Removes a section marker from the start of a line.
     *
     * <p>The marker is detected on the trimmed, lower-cased form of the line,
     * so the cut must be made on the trimmed line as well; cutting the raw
     * line would be off by the amount of leading whitespace.
     *
     * @param line   the raw input line
     * @param marker the marker that the normalised line starts with
     * @return the text following the marker, trimmed; empty if nothing follows
     */
    private static String stripLeadingMarker(String line, String marker){
        String trimmed = line.trim();
        if (trimmed.length()<=marker.length()){
            return "";
        }
        return trimmed.substring(marker.length()).trim();
    }

    /**
     * Looks up the trace marker for a line that starts the paper body.
     *
     * @param normalized lower-cased, whitespace-normalised line
     * @return the marker to print, or {@code null} if the line is not a known heading
     */
    private static String findContentHeading(String normalized){
        for (String[] heading: CONTENT_HEADINGS){
            if (normalized.startsWith(heading[0])){
                return heading[1];
            }
        }
        return null;
    }

    /**
     * Feeds one line of the paper into the state machine and stores it under
     * the section that is current after the transition.
     *
     * <p>Blank lines only advance the line/character counters. The first
     * non-blank line records the paper id and is taken as (the start of) the
     * title. Once more than 5000 characters have been seen, everything is
     * treated as content.
     *
     * @param line the next line of text; must not be {@code null}
     * @return the state after processing this line
     */
    public STATE processLine(String line){
        lineNumber++;
        cntChar += line.length();

        // Locale.ROOT keeps the marker matching independent of the default locale.
        String tempStr = line.replaceAll("\\s+", " ").toLowerCase(java.util.Locale.ROOT).trim();
        if (tempStr.length()==0)
            return state;

        System.out.println(String.format("%03d:%s\t%s", lineNumber, state, line));

        String heading;
        if (STATE.ready.equals(state)){
            System.out.println("---title--");
            data.put(STATE.ready, id);
            state = STATE.title;
        }else if (!STATE.content.equals(state)&& tempStr.startsWith("abstract.")){
            System.out.println("---abstract--");
            state = STATE._abstract;
            line = stripLeadingMarker(line, "abstract.");
        }else if (!STATE.content.equals(state)&& tempStr.startsWith("abstract")){
            System.out.println("---abstract--");
            state = STATE._abstract;
            line = stripLeadingMarker(line, "abstract");
        }else if (STATE.title.equals(state)&& tempStr.indexOf("springer")>=0){
            // A publisher header was mistaken for the title: discard what was
            // collected and keep waiting for the real title. ">= 0" so that a
            // line that begins with "springer" is caught as well.
            System.out.println("---title--");
            data.put(STATE.title, null);//reset
            line = "";
        }else if (STATE.title.equals(state)&& isAuthorLine(tempStr)){
            System.out.println("---author--");
            state = STATE.author;
        }else if (STATE.author.equals(state)&& !isAuthorLine(tempStr)){
            System.out.println("---affiliation--");
            state = STATE.affiliation;
        }else if (STATE._abstract.equals(state)&& tempStr.startsWith("keywords:")){
            System.out.println("---keywords--");
            state = STATE.keyword;
            line = stripLeadingMarker(line, "keywords:");
        }else if (null!=(heading = findContentHeading(tempStr))){
            System.out.println(heading);
            state = STATE.content;
        }else if (cntChar >5000 && !STATE.content.equals(state)){
            // Fallback: no heading recognised, but we are far enough into the
            // paper that the front matter must be over.
            System.out.println("---5000--");
            state = STATE.content;
        }

        if (line.length()==0)
            return state;

        appendContent(state, line);
        return state;
    }

    /**
     * Normalises the collected text: in the abstract, every "- " sequence is
     * removed (joining words hyphenated across line breaks), and in all
     * sections typographic double quotes are replaced by ASCII {@code "}.
     */
    public void cleanup(){
        for (STATE state: STATE.values()){
            String value = data.get(state);
            if (null==value)
                continue;

            if (STATE._abstract.equals(state)){
                value = value.replace("- ", "");
            }

            // Strings are immutable: the result has to be assigned back.
            // NOTE(review): the original search literals were mis-encoded in the
            // source file; left/right double quotation marks (U+201C, U+201D) are
            // assumed from the '"' replacement - confirm against real input.
            value = value.replace("\u201C", "\"");
            value = value.replace("\u201D", "\"");

            data.put(state, value);
        }
    }

    /**
     * Cleans up the collected text, prints every section to
     * {@code System.out} and writes them to {@code local/output/pdf.tsv}.
     * Sections that were never seen are reported as empty strings.
     */
    public void printReport(){
        cleanup();

        System.out.println("----report of paper content-----");
        DataSmartMap dsm = new DataSmartMap();
        for (STATE state: STATE.values()){
            dsm.addStringProperty(state.toString());
            if (null==data.get(state)){
                data.put(state, "");
            }
        }
        for(STATE state: data.keySet()){
            dsm.put(state.name(), data.get(state));
            System.out.println(String.format("[%s]\t%s", state, data.get(state)));
        }

        File fOut = new File("local/output/pdf.tsv");
        try {
            // NOTE(review): this writes the row first and the header after it on
            // the same line; kept as-is because the meaning of the two boolean
            // flags and of the DataSmartMap TSV methods is not visible here.
            ToolIO.pipeStringToFile( String.format("%s\t%s\n", dsm.toTSVrow(), dsm.toTSVheader()),fOut.getAbsolutePath(), false, true);
        } catch (Sw4jException e) {
            e.printStackTrace();
        }
    }
}