package semanticMarkup.ling.learn.auxiliary; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import semanticMarkup.core.Treatment; public class FileLoader { private String dir; private int count=0; private List<String> fileNameList = new LinkedList<String>(); private List<Integer> typeList = new ArrayList<Integer>(); private List<String> textList = new LinkedList<String>(); private List<Treatment> treatmentList = new LinkedList<Treatment>(); public FileLoader(String d) { this.dir = d; } public boolean load() { //System.out.println("Reading sentences:\n"); File myDir = new File(this.dir); File[] contents = myDir.listFiles(); this.count=contents.length; for (int i = 0; i < count; i++) { File f = contents[i]; //System.out.println(f.getName() + ": " + f.length()); String name = f.getName(); int type = this.getType(f.getName()); //System.out.println(type); String s2 = this.dir+"/"+f.getName(); //System.out.println(s2); File file = new File(s2); BufferedReader reader = null; String text = null; try { reader = new BufferedReader(new FileReader(file)); text = reader.readLine(); //System.out.println(text); reader.close(); } catch (IOException e) { e.printStackTrace(); return false; } this.fileNameList.add(name); this.typeList.add(type); this.textList.add(text); Treatment tm = new Treatment(name,text); this.treatmentList.add(tm); } //for (int i=0;i<fileNameList.size();i++) { // System.out.println(fileNameList.get(i)+sentList.get(i)+'\n'); //} return true; } //determine if a file contains a character statement or a character state(description) statement, by the file name //0-character, 1-description private int getType(String fileName) { //System.out.println(fileName); fileName = fileName.replaceAll(".*\\.xml_","");//remove pdf.xml fileName = fileName.replaceAll("[^_]","");//remove all non_ charaters //System.out.println(fileName); if (fileName.length()==1) return 1; else return 0; } public List<String> getFileNameList() { return this.fileNameList; } public List<Integer> getTypeList() { return this.typeList; } public List<String> getTextList() { return this.textList; } public List<Treatment> getTreatmentList(){ return this.treatmentList; } public int getCount() { return this.count; } public List<String> getUnknownWordList() { List<String> unknownList = new LinkedList<String>(); for (int i=0;i<count;i++) { //System.out.println(i); //System.out.println(sentList.get(i)); //System.out.println(fileNameList.get(i)); if (textList.get(i)!= null) { //System.out.println(sentList.get(i).length()); //System.out.println("\n"); String[] tokenList = ((textList.get(i)).toLowerCase()).split("\\s"); for (int x=0; x<tokenList.length; x++) { //System.out.println(i); //System.out.println(tokenList.length); System.out.println(tokenList[x]); unknownList.add(tokenList[x]); } } } return unknownList; } }