package context.core.task.lexisnexis;
import context.core.util.CSVWriter;
import context.core.util.DistanceUtil;
import context.core.util.MyPair;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.PrintWriter;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import org.openide.util.Exceptions;
/**
*
* @author Jana Diesner
*/
public class LxNxDataProvider {
// Next text ID to assign; incremented once per successfully parsed article.
private int idCounter = 0;
// Output directory for per-subject CSV files (field name typo "subects" is
// kept because other methods in this class reference it).
private String subectsOutputDir;
// Excel file receiving metadata for ALL parsed records.
private String metadataFile;
// Year token derived from the current input file's name (see processTexts()).
private String year = "";
// Subject string -> number of texts it occurred in.
private Hashtable<String, Integer> htSubjectsPerData = new Hashtable<String, Integer>();
// Optional whitelist of text IDs; only consulted when disregardTextIdList is false.
private HashSet<String> hsIdsOfConsideredTexts = new HashSet<String>();
private boolean disregardTextIdList = true;
// All parsed records in parse order; deliberately NOT cleared between runs.
private List<LxNxMetadata> originalList = new ArrayList<LxNxMetadata>();
// Deduplication results: one representative per duplicate group / all group members.
private List<LxNxMetadata> uniqueList = new ArrayList<LxNxMetadata>();
private List<LxNxMetadata> duplicateList = new ArrayList<LxNxMetadata>();
// Text ID -> metadata record.
private Map<Integer, LxNxMetadata> textIDMap = new HashMap<Integer, LxNxMetadata>();
// Group representative ID -> member IDs (includes the representative itself).
private Map<Integer, List<Integer>> duplicateMap = new HashMap<Integer, List<Integer>>();
// Undirected similarity graph between document IDs (adjacency lists).
private Map<Integer, List<Integer>> docGraph = new HashMap<Integer, List<Integer>>();
// Visited flags for the DFS over docGraph.
private Map<Integer, Boolean> mark = new HashMap<Integer, Boolean>();
// Per-run output file and directory paths.
private String uniqueFile;
private String duplicateFile;
private String fromDirectory;
private String toDirectory;
private String duplicateDirectory;
private String uniqueDirectory;
private String textBodyToDirectory;
private String textBodyDuplicateDirectory;
private String textBodyUniqueDirectory;
private String uniqueAllTextFile;
// Optional GEXF network output paths; null disables the corresponding network.
private String gexfAllNetworkFile;
private String gexfPersonNetworkFile;
private String gexfSubjectNetworkFile;
private String networkDir;
// Codebook entities extracted from metadata, plus a name-keyed index of them.
private List<CodebookEntity> codebook = new ArrayList<CodebookEntity>();
private Map<String, CodebookEntity> codebookMap = new HashMap<String, CodebookEntity>();
private String codebookCSVFile;
private String codebookRejectCSVFile;
// When false, no files are written; only in-memory structures are filled.
private boolean genOutputFiles;
// Column headers for the codebook CSV output.
List<String> codebookTemplate = Arrays.asList("Text", "Name", "Percent", "Type", "Subtype");
// Entity co-occurrence graphs: entity -> list of (neighbor, weight) pairs.
private Map<CodebookEntity, List<MyPair<CodebookEntity, Integer>>> allGraph = new HashMap<CodebookEntity, List<MyPair<CodebookEntity, Integer>>>();
private Map<CodebookEntity, List<MyPair<CodebookEntity, Integer>>> personGraph = new HashMap<CodebookEntity, List<MyPair<CodebookEntity, Integer>>>();
private Map<CodebookEntity, List<MyPair<CodebookEntity, Integer>>> subjectGraph = new HashMap<CodebookEntity, List<MyPair<CodebookEntity, Integer>>>();
// Platform-specific path separator (field name typo "Seperator" kept; widely referenced).
private String fileSeperator = System.getProperty("file.separator");
// private static double treshold = 70;
/**
 * Creates a provider with empty state; configure output directories/files via
 * the setters or one of the parseAndDeduplicate(...) overloads before use.
 */
public LxNxDataProvider() {
}
/**
 * Returns the de-duplicated metadata records produced by the most recent
 * parseAndDeduplicate(...) run (empty before any run).
 *
 * @return the internal unique-document list (not a defensive copy)
 */
public List<LxNxMetadata> getUniqueList() {
return uniqueList;
}
/**
 * Resets all per-run collections so the provider can be reused for another
 * parse run. {@code originalList} is deliberately left untouched, preserving
 * the accumulation behavior of earlier runs.
 */
private void clearAllVariables() {
    // Deduplication state.
    uniqueList.clear();
    duplicateList.clear();
    duplicateMap.clear();
    docGraph.clear();
    textIDMap.clear();
    // NB: originalList is intentionally NOT cleared.
    // Codebook / network state.
    allGraph.clear();
    personGraph.clear();
    subjectGraph.clear();
    codebook.clear();
    codebookMap.clear();
    // Subject bookkeeping.
    hsIdsOfConsideredTexts.clear();
    htSubjectsPerData.clear();
}
/**
 * Parses and de-duplicates the texts in {@code originalDir} without writing
 * parsed/unique/duplicate files, then generates three GEXF network files
 * (all-all, person-person, subject-subject) in {@code networkDir}.
 * WARNING: {@code networkDir} is deleted and recreated first.
 *
 * @param originalDir directory of raw LexisNexis text files
 * @param networkDir  output directory for the .gexf network files
 */
public void generateNetwork(String originalDir, String networkDir) {
// genOutputFiles=false: only the in-memory lists are needed here.
parseAndDeduplicate(originalDir, null, null, null, null, null, null, false);
LxNxDataProvider.initializeDir(networkDir);
setGexfAllNetworkFile(networkDir + fileSeperator + "all_all.gexf");
setGexfPersonNetworkFile(networkDir + fileSeperator + "person_person.gexf");
setGexfSubjectNetworkFile(networkDir + fileSeperator + "subject_subject.gexf");
// Null CSV paths: with genOutputFiles=false, generateCodebook() skips CSV
// output and only writes the .gexf files configured above.
generateCodebook(uniqueList, null, null);
}
/**
 * Writes an aggregated metadata co-occurrence network CSV: one weighted edge
 * per entity pair accumulated over all documents in {@code list}.
 *
 * @param list      metadata records to analyze
 * @param outputDir output directory (created if missing, not wiped)
 * @param type1     first metadata type
 * @param type2     second metadata type
 * @param threshold minimum relevance percent (exclusive) for inclusion
 */
public void generateAggrMDNetwork(List<LxNxMetadata> list, String outputDir, MetadataType type1, MetadataType type2, int threshold) {
generateCustomNetwork(list, outputDir, type1, type2, threshold, false);
}
/**
 * Writes a verbose metadata co-occurrence network CSV: one row per
 * document/entity-pair occurrence (no aggregation).
 *
 * @param list      metadata records to analyze
 * @param outputDir output directory (created if missing, not wiped)
 * @param type1     first metadata type
 * @param type2     second metadata type
 * @param threshold minimum relevance percent (exclusive) for inclusion
 */
public void generateVerboseMDNetwork(List<LxNxMetadata> list, String outputDir, MetadataType type1, MetadataType type2, int threshold) {
generateCustomNetwork(list, outputDir, type1, type2, threshold, true);
}
/**
 * Convenience method that writes both the aggregated and the verbose network
 * CSVs (see generateAggrMDNetwork / generateVerboseMDNetwork) into
 * {@code outputDir}.
 *
 * @param list      metadata records to analyze
 * @param outputDir output directory (created if missing, not wiped)
 * @param type1     first metadata type
 * @param type2     second metadata type
 * @param threshold minimum relevance percent (exclusive) for inclusion
 */
public void generateBothMDNetwork(List<LxNxMetadata> list, String outputDir, MetadataType type1, MetadataType type2, int threshold) {
generateCustomNetwork(list, outputDir, type1, type2, threshold, false);
generateCustomNetwork(list, outputDir, type1, type2, threshold, true);
}
/**
 * Builds the output paths for a custom type1-type2 network ("_verbose" or
 * "_aggr" flavor) inside {@code outputDir} and delegates the actual CSV
 * generation to generateCustomCodebook().
 */
private void generateCustomNetwork(List<LxNxMetadata> list, String outputDir, MetadataType type1, MetadataType type2, int threshold, boolean verbose) {
    // Ensure the target directory exists without wiping existing content.
    LxNxDataProvider.initializeDirSafe(outputDir);
    // File stem encodes the two metadata types plus the output flavor.
    String suffix = verbose ? "_verbose" : "_aggr";
    String stem = type1.getValue() + "_" + type2.getValue() + suffix;
    String csvPath = outputDir + fileSeperator + stem + ".csv";
    String rejectPath = outputDir + fileSeperator + stem + ".reject";
    generateCustomCodebook(list, type1, type2, csvPath, rejectPath, threshold, verbose);
}
// Runs the full parse/de-duplicate pipeline and then also generates the
// default codebook CSV plus reject file; any pre-existing codebook output
// files are deleted first so the writers start from empty files.
private void parseAndDeduplicate(String originalDir, String parsedDir, String uniqueDir, String duplicateDir, String metadataXLSFile, String uniqueXLSFile, String duplicateXLSFile, String codebookCSVFile, String codebookRejectCSVFile) {
parseAndDeduplicate(originalDir, parsedDir, uniqueDir, duplicateDir, metadataXLSFile, uniqueXLSFile, duplicateXLSFile, true);
this.codebookCSVFile = codebookCSVFile;
this.codebookRejectCSVFile = codebookRejectCSVFile;
// Remove stale outputs before regenerating them.
FileUtils.deleteQuietly(new File(this.codebookCSVFile));
FileUtils.deleteQuietly(new File(this.codebookRejectCSVFile));
generateCodebook(uniqueList, this.codebookCSVFile, this.codebookRejectCSVFile);
}
/**
 * Builds a co-occurrence network between entities of {@code type1} and
 * {@code type2} over the given documents and writes it to
 * {@code codebookCSVFile}; entity strings rejected by the parser are written
 * to {@code codebookRejectCSVFile}.
 *
 * Verbose mode emits one row per document/pair occurrence; aggregated mode
 * accumulates pair weights in a graph and writes them once at the end.
 *
 * @param list documents whose metadata is analyzed
 * @param type1 first metadata type
 * @param type2 second metadata type
 * @param codebookCSVFile output CSV path
 * @param codebookRejectCSVFile output path for rejected entity strings
 * @param threshold minimum relevance percent (exclusive) for inclusion
 * @param verbose per-occurrence rows when true, aggregated weights when false
 */
public void generateCustomCodebook(List<LxNxMetadata> list, MetadataType type1, MetadataType type2, String codebookCSVFile, String codebookRejectCSVFile, int threshold, boolean verbose) {
// Main file uses ";" as field separator and "\n" as record separator.
CSVWriter csvWriter = new CSVWriter(codebookCSVFile, ";", "\n");
CSVWriter csvRejectWriter = new CSVWriter(codebookRejectCSVFile);
Map<CodebookEntity, List<MyPair<CodebookEntity, Integer>>> graph = new HashMap<CodebookEntity, List<MyPair<CodebookEntity, Integer>>>();
System.out.println(codebookCSVFile + "," + verbose);
// NOTE(review): this 4-column header does not match the 5-value verbose rows
// written below — confirm the intended CSV layout.
csvWriter.append("source,target,weight,type");
CodebookEntity.clearRejectList();
for (LxNxMetadata mdata : list) {
System.out.println("Processing document " + mdata.getTextID() + "...");
List<CodebookEntity> type1_cbs = getCodebookEntities(type1, mdata);
List<CodebookEntity> type2_cbs = getCodebookEntities(type2, mdata);
if (type1.equals(type2)) {
// Same-type network: order pair members by name so each undirected
// edge (and no self-edge) is emitted once.
if (verbose) {
for (CodebookEntity ent1 : type1_cbs) {
for (CodebookEntity ent2 : type2_cbs) {
if (ent1.getName().compareTo(ent2.getName()) > 0) {
if ((ent1.getPercent() > threshold) && (ent2.getPercent() > threshold)) {
csvWriter.append(mdata.getTextID() + "," + ent1.getName() + "," + ent1.getPercent() + "," + ent2.getName() + "," + ent2.getPercent());
}
}
}
}
} else {
addToGraph(type1_cbs, graph, threshold);
}
} else {
// Cross-type network: connect every pair clearing the threshold.
if (verbose) {
for (CodebookEntity ent1 : type1_cbs) {
for (CodebookEntity ent2 : type2_cbs) {
if ((ent1.getPercent() > threshold) && (ent2.getPercent() > threshold)) {
csvWriter.append(mdata.getTextID() + "," + ent1.getName() + "," + ent1.getPercent() + "," + ent2.getName() + "," + ent2.getPercent());
}
}
}
} else {
addToGraph(type1_cbs, type2_cbs, graph, threshold);
}
}
}
// Aggregated mode: dump the accumulated weighted edges in one pass.
if (!verbose) {
writeGraphToCSV(graph, csvWriter);
}
csvRejectWriter.appendAll(CodebookEntity.getRejectList());
csvWriter.close();
csvRejectWriter.close();
}
/**
 * Extracts codebook entities (persons, locations, organizations, companies,
 * subjects) from every document's metadata, builds the all/person/subject
 * co-occurrence graphs, optionally writes the codebook CSV and reject file
 * (only when genOutputFiles is true), and writes any configured GEXF
 * network files.
 *
 * @param list documents whose metadata is analyzed
 * @param codebookCSVFile output CSV path (ignored when genOutputFiles is false)
 * @param codebookRejectCSVFile reject-list path (ignored when genOutputFiles is false)
 */
public void generateCodebook(List<LxNxMetadata> list, String codebookCSVFile, String codebookRejectCSVFile) {
CSVWriter csvWriter = null;
CSVWriter csvRejectWriter = null;
if (genOutputFiles) {
csvWriter = new CSVWriter(codebookCSVFile, codebookTemplate);
csvRejectWriter = new CSVWriter(codebookRejectCSVFile);
}
for (LxNxMetadata mdata : list) {
// Persons feed both the combined graph and the person-only graph.
List<CodebookEntity> person_cbs = CodebookEntity.parseLine(mdata.getPerson(), "agent", "specific", mdata.getTextID());
// System.out.println("person:"+ mdata.getTextID() + " " + person_cbs.size());
codebook.addAll(person_cbs);
addToGraph(person_cbs, allGraph);
addToGraph(person_cbs, personGraph);
List<CodebookEntity> geo_cbs = CodebookEntity.parseLine(mdata.getGeo(), "location", "specific", mdata.getTextID());
codebook.addAll(geo_cbs);
addToGraph(geo_cbs, allGraph);
List<CodebookEntity> organization_cbs = CodebookEntity.parseLine(mdata.getOrganization(), "organization", "specific", mdata.getTextID());
codebook.addAll(organization_cbs);
addToGraph(organization_cbs, allGraph);
List<CodebookEntity> company_cbs = CodebookEntity.parseLine(mdata.getCompany(), "organization", "specific", mdata.getTextID());
codebook.addAll(company_cbs);
addToGraph(company_cbs, allGraph);
// Subjects feed both the combined graph and the subject-only graph.
List<CodebookEntity> subject_cbs = CodebookEntity.parseLine(mdata.getSubject(), "knowledge", "specific", mdata.getTextID());
codebook.addAll(subject_cbs);
// System.out.println("subject:"+ mdata.getTextID() + " " + subject_cbs.size());
addToGraph(subject_cbs, allGraph);
addToGraph(subject_cbs, subjectGraph);
// System.out.println();
// System.out.println("<>");
// System.out.println("");
}
System.out.println("Codebook Size=" + codebook.size());
System.out.println("RejectList Size=" + CodebookEntity.getRejectList().size());
// Index entities by name; first occurrence of a name wins.
for (CodebookEntity ent : codebook) {
if (!codebookMap.containsKey(ent.getName())) {
codebookMap.put(ent.getName(), ent);
}
}
System.out.println("CodebookMap Size=" + codebookMap.keySet().size());
if (genOutputFiles) {
csvRejectWriter.appendAll(CodebookEntity.getRejectList());
csvWriter.appendAll(codebookMap.values());
csvWriter.close();
csvRejectWriter.close();
}
// Each GEXF output is generated only when its path has been configured.
if (getGexfAllNetworkFile() != null) {
GephiNetworkGenerator allgraphgen = new GephiNetworkGenerator(allGraph, getGexfAllNetworkFile());
allgraphgen.script();
}
if (getGexfPersonNetworkFile() != null) {
GephiNetworkGenerator persongraphgen = new GephiNetworkGenerator(personGraph, getGexfPersonNetworkFile());
persongraphgen.script();
}
if (getGexfSubjectNetworkFile() != null) {
GephiNetworkGenerator subjectgraphgen = new GephiNetworkGenerator(subjectGraph, getGexfSubjectNetworkFile());
subjectgraphgen.script();
}
}
/**
 * Parses all raw LexisNexis files in {@code originalDir} and splits the
 * results into unique and duplicate sets, writing all output files. All
 * arguments are required; use the 8-argument overload to skip output files.
 *
 * @param originalDir      directory of raw LexisNexis text files
 * @param parsedDir        output directory for individual parsed articles
 * @param uniqueDir        output directory for unique articles
 * @param duplicateDir     output directory for duplicate articles
 * @param metadataXLSFile  Excel file listing all parsed records
 * @param uniqueXLSFile    Excel file listing unique records
 * @param duplicateXLSFile Excel file listing duplicate records
 * @throws IllegalArgumentException if any argument is null
 */
public void parseAndDeduplicate(String originalDir, String parsedDir, String uniqueDir, String duplicateDir, String metadataXLSFile, String uniqueXLSFile, String duplicateXLSFile) {
    if (originalDir == null || parsedDir == null || uniqueDir == null || duplicateDir == null || metadataXLSFile == null || uniqueXLSFile == null || duplicateXLSFile == null) {
        // Fixed the typo "parseAndDuplicate" in the message and switched to
        // IllegalArgumentException — it extends RuntimeException, so any
        // existing catch blocks continue to work.
        throw new IllegalArgumentException("All of the parameters of parseAndDeduplicate should have valid values");
    }
    parseAndDeduplicate(originalDir, parsedDir, uniqueDir, duplicateDir, metadataXLSFile, uniqueXLSFile, duplicateXLSFile, true);
}
/**
 * Full pipeline entry point: parses every raw LexisNexis file in
 * {@code originalDir}, writes each extracted article to {@code parsedDir},
 * detects duplicates, and splits results into unique/duplicate sets. When
 * {@code genOutputFiles} is true, per-set text files, refined body-only
 * copies, a concatenated unique-text file, and three Excel summaries are
 * also written; when false, only the in-memory lists are populated.
 * WARNING: {@code parsedDir} and {@code uniqueDir} are deleted and recreated
 * when output files are generated.
 *
 * @param originalDir      directory of raw LexisNexis text files
 * @param parsedDir        output directory for individual parsed articles
 * @param uniqueDir        output directory for unique articles
 * @param duplicateDir     output directory for duplicate articles
 * @param metadataXLSFile  Excel file listing all parsed records
 * @param uniqueXLSFile    Excel file listing unique records
 * @param duplicateXLSFile Excel file listing duplicate records
 * @param genOutputFiles   when false, nothing is written to disk
 */
public void parseAndDeduplicate(String originalDir, String parsedDir, String uniqueDir, String duplicateDir, String metadataXLSFile, String uniqueXLSFile, String duplicateXLSFile, boolean genOutputFiles) {
this.fromDirectory = originalDir;
this.toDirectory = parsedDir;
this.uniqueDirectory = uniqueDir;
this.duplicateDirectory = duplicateDir;
this.metadataFile = metadataXLSFile;
this.uniqueFile = uniqueXLSFile;
this.duplicateFile = duplicateXLSFile;
this.genOutputFiles = genOutputFiles;
// Reset per-run state (originalList intentionally persists across runs).
clearAllVariables();
// final String fileNameListFile = baseDirectory + "files_to_store.txt";
// subectsOutputDir = baseDirectory + "subject";
/* try {
metadatafw = new FileWriter(metadataFile);
metadatapw = new PrintWriter(metadatafw);
} catch (IOException ex) {
Logger.getLogger(LexisDataToDB.class.getName()).log(Level.SEVERE, null, ex);
}
*/
//hsIdsOfConsideredTexts = this.getIdsOfTextsFromList();
//prepareDB();
//fills db and folder with ALL texts
// Wipe/recreate the output directories (only those that are configured).
if (genOutputFiles) {
initializeDir(toDirectory);
initializeDir(uniqueDir);
if (getTextBodyDuplicateDirectory() != null) {
initializeDir(getTextBodyDuplicateDirectory());
}
if (getTextBodyToDirectory() != null) {
initializeDir(getTextBodyToDirectory());
}
if (getTextBodyUniqueDirectory() != null) {
initializeDir(getTextBodyUniqueDirectory());
}
}
// Parse every raw file into originalList (and parsedDir when enabled).
processTexts();
if (genOutputFiles) {
LxNxExcelWriter.writetoFile(metadataXLSFile, originalList);
}
// recognizeDuplicates(metadataList);
//in here switch for dedpuing during oad or not
// stores only selected files as specified per hand in DB
//saveSelectedFilesAsPlainText(fromDirectory, toDirectory, fileNameListFile);
// Build the similarity graph and split into uniqueList/duplicateList.
removeDuplicatedTextsFromDB();
if (genOutputFiles) {
LxNxExcelWriter.writetoFile(duplicateFile, duplicateList);
LxNxExcelWriter.writetoFile(uniqueFile, uniqueList);
if (getUniqueAllTextFile() == null) {
setUniqueAllTextFile(toDirectory + fileSeperator + "AllUniqueText.txt");
}
File allTextFile = new File(getUniqueAllTextFile());
// Copy each unique article into uniqueDir; optionally also write a
// refined body-only copy and append it to the concatenated text file.
for (LxNxMetadata data : uniqueList) {
File fromFile = new File(toDirectory + fileSeperator + data.getBestDate() + "_" + data.getTextID() + ".txt");
File toFile = new File(uniqueDir + fileSeperator + data.getBestDate() + "_" + data.getTextID() + ".txt");
copyFile(fromFile, toFile);
if (getTextBodyUniqueDirectory() != null) {
copyAndRefineFile(uniqueDir + fileSeperator + data.getBestDate() + "_" + data.getTextID() + ".txt", getTextBodyUniqueDirectory() + fileSeperator + data.getBestDate() + "_" + data.getTextID() + ".txt");
if (getUniqueAllTextFile() != null) {
try {
String file1Str = FileUtils.readFileToString(new File(getTextBodyUniqueDirectory() + fileSeperator + data.getBestDate() + "_" + data.getTextID() + ".txt"));
FileUtils.write(allTextFile, file1Str, true);
} catch (IOException ex) {
Exceptions.printStackTrace(ex);
}
}
}
}
}
//storeSubjectsPerCorpus();
}
/**
 * Detects duplicate documents and fills uniqueList/duplicateList. Each
 * document is compared only against the previous {@code max_domain - 1}
 * documents in parse order (a sliding window); similar pairs become edges in
 * docGraph, and each connected component is one duplicate group whose
 * representative goes into uniqueList. When genOutputFiles is true, each
 * non-representative member's text file is copied into duplicateDirectory
 * (which is wiped first) and optionally refined.
 */
private void removeDuplicatedTextsFromDB() {
// Window size: only near neighbors in parse order are compared.
int max_domain = 20;
// Index every document by its numeric text ID and start with no edges.
for (int i = 0; i < originalList.size(); i++) {
final int textID = Integer.parseInt(originalList.get(i).getTextID());
textIDMap.put(textID, originalList.get(i));
docGraph.put(textID, new ArrayList<Integer>());
}
// Add an undirected edge for every similar pair within the window.
for (int i = 0; i < originalList.size(); i++) {
for (int j = 1; j < max_domain; j++) {
int ind = i - j;
if (ind < 0) {
break;
}
if (isSimilar(originalList.get(i), originalList.get(ind))) {
int iID = Integer.parseInt(originalList.get(i).getTextID());
int indID = Integer.parseInt(originalList.get(ind).getTextID());
docGraph.get(iID).add(indID);
docGraph.get(indID).add(iID);
}
}
}
for (Integer docID : docGraph.keySet()) {
mark.put(docID, false);
}
// One DFS per unvisited node: each connected component = duplicate group.
for (Integer docID : docGraph.keySet()) {
if (mark.get(docID).equals(false)) {
duplicateMap.put(docID, new ArrayList<Integer>());
dfs(docID, docID);
}
}
if (genOutputFiles) {
initializeDir(duplicateDirectory);
}
// Group representative -> uniqueList; every member of a multi-document
// group -> duplicateList (including the representative itself).
for (Integer docID : duplicateMap.keySet()) {
uniqueList.add(textIDMap.get(docID));
// duplicateList.add(textIDMap.get(docID));
if (duplicateMap.get(docID).size() > 1) {
for (Integer others : duplicateMap.get(docID)) {
final LxNxMetadata otherObj = textIDMap.get(others);
duplicateList.add(otherObj);
if (!docID.equals(others)) {
if (genOutputFiles) {
File fromFile = new File(toDirectory + fileSeperator + otherObj.getBestDate() + "_" + otherObj.getTextID() + ".txt");
File toFile = new File(duplicateDirectory + fileSeperator + otherObj.getBestDate() + "_" + otherObj.getTextID() + ".txt");
copyFile(fromFile, toFile);
if (getTextBodyDuplicateDirectory() != null) {
copyAndRefineFile(duplicateDirectory + fileSeperator + otherObj.getBestDate() + "_" + otherObj.getTextID() + ".txt", getTextBodyDuplicateDirectory() + fileSeperator + otherObj.getBestDate() + "_" + otherObj.getTextID() + ".txt");
}
}
}
}
}
}
}
/**
 * Depth-first search over the document-similarity graph: marks every
 * document reachable from {@code docID} as a member of {@code root}'s
 * duplicate group and records the representative on each member.
 *
 * @param root  representative document ID of the current duplicate group
 * @param docID document currently being visited
 */
private void dfs(Integer root, Integer docID) {
mark.put(docID, true);
duplicateMap.get(root).add(docID);
// NOTE(review): docID is a parsed text ID but is used here as a positional
// index into originalList. This only lines up if IDs are assigned 0..n-1 in
// insertion order (idCounter suggests they are) — confirm, or use
// textIDMap.get(docID) instead.
originalList.get(docID).setRefDoc(root + "");
for (Integer other : docGraph.get(docID)) {
if (mark.get(other).equals(false)) {
dfs(root, other);
}
}
}
/**
 * A string is usable for comparison only when it is non-null and longer than
 * one character.
 */
private boolean isValid(String str) {
    return str != null && str.length() > 1;
}
/**
 * True only when both strings pass isValid() and are exactly equal.
 */
private boolean isEqual(String str1, String str2) {
    return isValid(str1) && isValid(str2) && str1.equals(str2);
}
/**
 * Heuristic duplicate check: identical (valid) titles match immediately;
 * otherwise edit distances over several metadata fields are summed, a fixed
 * penalty is added for noticeably different clean lengths, and the total is
 * compared against a fixed threshold.
 */
private boolean isSimilar(LxNxMetadata d1, LxNxMetadata d2) {
    final double threshold = 15;
    // Matching valid titles are an immediate duplicate hit.
    if (isEqual(d1.getTitle(), d2.getTitle())) {
        return true;
    }
    // Accumulate field-wise edit distances...
    double diff = DistanceUtil.editDistance(d1.getSubject(), d2.getSubject());
    diff += DistanceUtil.editDistance(d1.getAuthor(), d2.getAuthor());
    diff += DistanceUtil.editDistance(d1.getPubType(), d2.getPubType());
    diff += DistanceUtil.editDistance(d1.getSource(), d2.getSource());
    // ...plus a flat penalty when the cleaned text lengths diverge.
    if (Math.abs(d1.getCleanLength() - d2.getCleanLength()) > 10) {
        diff += 2;
    }
    return diff < threshold;
}
/**
 * Copies {@code fromFile} to {@code toFile}, logging (but not propagating)
 * any I/O failure.
 */
private void copyFile(File fromFile, File toFile) {
try {
FileUtils.copyFile(fromFile, toFile);
} catch (IOException ex) {
Logger.getLogger(LxNxDataProvider.class.getName()).log(Level.SEVERE, null, ex);
}
}
/**
 * Iterates over every file in fromDirectory, derives the "year" token from
 * each file name, and extracts the articles from every non-empty .txt file
 * via extractContentFromFile(). Errors are logged and swallowed.
 */
private void processTexts() {
try {
File d = new File(fromDirectory);
int counter = 0;
if (d.isDirectory()) {
File[] fs = d.listFiles();
for (int i = 0; i < fs.length; i++) {
counter = counter + 1;
File f = fs[i];
System.out.println(this.getClass().getName() + ".processTexts() file: " + f.getAbsolutePath());
year = f.getName();
System.out.println("year" + year);
// NOTE(review): assumes file names end in at least 7 chars to strip
// (e.g. "2005_01.txt"); shorter names throw here and are caught by
// the blanket catch below — confirm the expected naming scheme.
year = year.substring(0, year.length() - 7);
if (f.getName().toLowerCase().endsWith(".txt")) {
if (f.length() != 0) {
this.extractContentFromFile(f);
//System.out.print(counter+"/"+fs.length +" ");
}
}
}
System.out.println(counter);
}
} catch (Exception e) {
System.out.println(this.getClass().getName()
+ ".LexisDataToDB() error: " + e);
e.printStackTrace();
}// END catch
}
/**
 * Creates {@code dir} (including any missing parents) if it does not already
 * exist; existing contents are left untouched. I/O failures are logged, not
 * propagated.
 *
 * @param dir directory path to create
 */
public static void initializeDirSafe(String dir) {
try {
FileUtils.forceMkdir(new File(dir));
} catch (IOException ex) {
Logger.getLogger(LxNxDataProvider.class.getName()).log(Level.SEVERE, null, ex);
}
}
/**
 * Recreates {@code dir} as an empty directory. WARNING: any existing
 * contents are deleted first. I/O failures are logged, not propagated.
 *
 * @param dir directory path to wipe and recreate
 */
public static void initializeDir(String dir) {
try {
FileUtils.deleteDirectory(new File(dir));
FileUtils.forceMkdir(new File(dir));
} catch (IOException ex) {
Logger.getLogger(LxNxDataProvider.class.getName()).log(Level.SEVERE, null, ex);
}
}
/**
 * Splits one raw LexisNexis export file into individual articles. Lines
 * before the first " DOCUMENTS" marker are skipped; each subsequent
 * " of ... DOCUMENTS" header closes the article accumulated so far, which is
 * then handed to parseText(). The trailing buffer after the last header is
 * parsed as the final article. Errors are logged and swallowed.
 *
 * @param f UTF-8 encoded raw LexisNexis text file
 */
private void extractContentFromFile(File f) {
try {
System.out.print("extractContentFromFile:" + f.getName());
//FileReader fr = new FileReader(f);
// Explicit UTF-8 decoding (FileReader would use the platform charset).
FileInputStream fis = new FileInputStream(f);
InputStreamReader streamReader = new InputStreamReader(fis, "UTF-8");
LineNumberReader lnr = new LineNumberReader(streamReader);
String line = "";
StringBuffer sb = new StringBuffer();
int textCounter = 0;
boolean beforeArticlesPart = true;
while ((line = lnr.readLine()) != null) {
// The first " DOCUMENTS" marker signals the start of article content.
if (line != null && line.contains(" DOCUMENTS")) {
// System.out.print(line.trim() + "; " + f.getName());
beforeArticlesPart = false;
}
if (beforeArticlesPart) {
continue;
}
// System.out.println("line: "+line);
// A "<n> of <m> DOCUMENTS" header ends the current article.
if (line.contains(" DOCUMENTS") && line.contains(" of ")) {
// frage
// sb.append(line);
// System.out.println("sb: "+sb.toString());
String text = sb.toString().trim();
if (text.length() > 0) {
// System.out.println(this.getClass().getName()+".extractContentFromFile() text2:"+text);
parseText(text);
textCounter++;
}
sb = new StringBuffer();
} else {
// System.out.println("else line:"+line);
sb.append(line);
sb.append("\n");
}
}
// Flush whatever accumulated after the last header as the final article.
String text = sb.toString().trim();
if (text.length() > 0) {
parseText(text);
textCounter++;
}
System.out.println(" textCounter: " + textCounter);
// System.out.println(this);
// NOTE(review): the reader is not closed when an exception is thrown
// mid-loop (only the blanket catch below runs) — consider a finally block.
lnr.close();
} catch (Exception e) {
System.out.println(this.getClass().getName()
+ ".extractContentFromFile() error: " + e);
e.printStackTrace();
}// END catch
}
/**
 * Parses one article's raw text: extracts its metadata via LxNxTextParser,
 * assigns it the next sequential text ID, optionally stores the text to disk
 * (when genOutputFiles is true), tallies per-subject counts, and appends the
 * resulting LxNxMetadata record to originalList. Errors are logged and
 * swallowed.
 *
 * @param s full raw text of a single article
 */
private void parseText(String s) {
try {
int pos = s.indexOf("\n");
String firstContentLine = "";
if (pos > 0) {
// NOTE(review): the substring result is discarded — presumably it was
// meant to be assigned to firstContentLine. Harmless as-is, because
// firstContentLine is never read afterwards.
s.substring(0, pos);
} else {
firstContentLine = s;
}
// System.out.println("s: "+s);
// The parser receives the next sequential ID as the text ID.
LxNxTextParser tp = new LxNxTextParser(s, "" + this.idCounter);
HashSet<String> hsCharacteristicStrings = new HashSet<String>();
TextDataSet tds = new TextDataSet();
tds.id = tp.getTextID();
tds.pubDate = tp.getBestDate();
tds.title = tp.getTitle();
//this is the switch to consider or ignore duplicates at the time of populating the database and saving the clean text files
// NOTE(review): hsCharacteristicStrings is method-local, so with
// includeDuplicates=true the contains() check below never filters anything.
boolean includeDuplicates = true;
if (includeDuplicates || !hsCharacteristicStrings.contains(tds.getCharacteristicString())) {
hsCharacteristicStrings.add(tds.getCharacteristicString());
idCounter = idCounter + 1;
if (genOutputFiles) {
this.storeTexts(s, tp.getTextID(), tp.getBestDate());
}
// test for output lenght
// tp.cleanLength();
// tp.getDateAsDate();
// Tally how many texts each subject string occurs in.
HashSet<String> subjectsPerText = tp.getSubjectsPerText();
for (String s2 : subjectsPerText) {
// figure out details about text formatting
char c = s2.toCharArray()[0];
boolean b = Character.isLetterOrDigit(c);
if (!b) {
// System.out.println("character:"+c+"---"+s2+" "+(c=='
// ')+Character.getNumericValue(c));
}
s2 = s2.trim();
// Count the subject unless an ID whitelist is active and excludes this text.
if (disregardTextIdList || this.hsIdsOfConsideredTexts.contains(tp.getTextID())) {
Integer existingValue = this.htSubjectsPerData.get(s2);
if (existingValue == null) {
this.htSubjectsPerData.put(s2, 1);
} else {
this.htSubjectsPerData.put(s2, existingValue + 1);
}
}
}
// Disabled debug dump of every parsed field.
if (false) {
// this is incomplete now
System.out.println("source: " + tp.getSource());
// System.out.println(s);
System.out.println("date: " + tp.getDate());
System.out.println("title: " + tp.getTitle());
System.out.println("author: " + tp.getAuthor());
System.out.println("section: " + tp.getSection());
System.out.println("length: " + tp.getLength());
System.out.println("length: " + tp.getSubject());
System.out.println("geo: " + tp.getGeo());
System.out.println("language: " + tp.getLanguage());
System.out.println("loadDate: " + tp.getLoadDate());
System.out.println("org: " + tp.getOrganization());
System.out.println("pubType: " + tp.getPubType());
System.out.println("graphic: " + tp.getGraphic());
System.out.println("person: " + tp.getPerson());
System.out.println("company: " + tp.getCompany());
System.out.println("ticker: " + tp.getTicker());
System.out.println("industry: " + tp.getIndustry());
System.out.println("journal_code: " + tp.getJournalCode());
System.out.println("city: " + tp.getCity());
System.out.println("ID: " + tp.getTextID());
System.out
.println("\n---------------------------------------------------------");
}
// fill db
if (true) {
//System.out.println("hallo");
LxNxMetadata lxnxmetadata = new LxNxMetadata(tp);
originalList.add(lxnxmetadata);
}
// end fill db
}
} catch (Exception e) {
System.out.println("parseText() error: " + e);
e.printStackTrace();
}// END catch
}
/**
 * Writes one parsed article to {@code toDirectory} as "&lt;date&gt;_&lt;id&gt;.txt"
 * and, when a text-body directory is configured, also writes a refined
 * body-only copy there. The writer is now closed in a finally block — the
 * original leaked the FileWriter/PrintWriter when an exception occurred
 * mid-write. Errors are logged and swallowed, as before.
 *
 * @param text full text of the article, lines separated by '\n'
 * @param id   text ID used in the output file name
 * @param date best-guess publication date used in the output file name
 */
private void storeTexts(String text, String id, String date) {
    PrintWriter pw = null;
    try {
        String outPath = toDirectory + fileSeperator + date + "_" + id + ".txt";
        pw = new PrintWriter(new FileWriter(outPath));
        String[] lines = text.split("\n");
        for (String line : lines) {
            pw.println(line);
        }
        pw.close();
        pw = null;
        if (getTextBodyToDirectory() != null) {
            copyAndRefineFile(outPath, getTextBodyToDirectory()
                    + fileSeperator + date + "_" + id + ".txt");
        }
    } catch (Exception e3) {
        System.out.println("storeTexts() error: " + e3);
        e3.printStackTrace();
    } finally {
        // Release the writer even when writing failed part-way through.
        if (pw != null) {
            pw.close();
        }
    }
}
/**
 * Copies (and refines to body-only text) just those files from
 * {@code fromDirectory} whose embedded text ID — the part of the file name
 * between "_" and "." — appears in {@code fileNameListFile} (one lowercase ID
 * per line). Progress is printed every 100 copied files. Errors are logged
 * and swallowed.
 *
 * @param fromDirectory    source directory of parsed text files
 * @param toDirectory      destination directory for the selected files
 * @param fileNameListFile text file listing the IDs of the files to keep
 */
private void saveSelectedFilesAsPlainText(String fromDirectory, String toDirectory, String fileNameListFile) {
try {
// NOTE(review): File.delete() on a non-empty directory is a silent no-op,
// so pre-existing contents of toDirectory may survive — confirm intent.
File d = new File(toDirectory);
d.delete();
d = new File(toDirectory);
d.mkdirs();
// Load the whitelist of IDs (lowercased, blank lines skipped).
HashSet<String> hsFileNames = new HashSet<String>();
FileReader fr = new FileReader(fileNameListFile);
LineNumberReader lnr = new LineNumberReader(fr);
String line = "";
while ((line = lnr.readLine()) != null) {
line = line.trim();
if (line.length() > 0) {
hsFileNames.add(line.toLowerCase());
}
}
lnr.close();
fr.close();
int numberOfFilesToCopy = hsFileNames.size();
int numberOfFilesToReview = 0;
System.out.println("number of files to copy:" + hsFileNames.size());
File fromDir = new File(fromDirectory);
File toDir = new File(toDirectory);
if (fromDir.isDirectory()) {
File[] fs = fromDir.listFiles();
numberOfFilesToReview = fs.length;
int filesToReviewCounter = 0;
int filesToCopyCounter = 0;
for (int i = 0; i < fs.length; i++) {
filesToReviewCounter++;
File f = fs[i];
System.out.println("testFileName:" + f.getName());
// Extract the text ID between "_" and "." in the file name.
int startID = 0;
int endID = 0;
startID = f.getName().indexOf("_");
endID = f.getName().indexOf(".");
String idOfText = f.getName().substring(startID + 1, endID);
System.out.println("idOfText: " + idOfText);
if (hsFileNames.contains(idOfText)) {
filesToCopyCounter++;
System.out.println("copyFile:" + f.getName());
copyAndRefineFile(f.getAbsolutePath(), toDir + fileSeperator + f.getName());
if (filesToCopyCounter % 100 == 0) {
System.out.println("saveSelectedFilesAsPlainText: review: " + filesToReviewCounter + fileSeperator + numberOfFilesToReview + " " + filesToCopyCounter + "/" + numberOfFilesToCopy);
}
}
}
// System.out.println(counter);
}
} catch (Exception e3) {
System.out.println("creatSelectedFilesDirectory() error: " + e3);
e3.printStackTrace();
}// END catch
}
/**
 * Reads {@code fromFile} fully, extracts just the article body via
 * LxNxTextParser, and writes that body line-by-line to {@code toFile}.
 * Streams are now closed in a finally block — the original leaked the
 * reader and writer when parsing or writing threw. Errors are logged and
 * swallowed, as before.
 *
 * @param fromFile path of the full parsed article
 * @param toFile   path for the refined, body-only copy
 */
private void copyAndRefineFile(String fromFile, String toFile) {
    LineNumberReader lnr = null;
    PrintWriter pw = null;
    try {
        //System.out.println("copyAndRefineFile: "+toFile);
        // Read the whole source file into memory.
        lnr = new LineNumberReader(new FileReader(fromFile));
        StringBuffer sb = new StringBuffer();
        String line;
        while ((line = lnr.readLine()) != null) {
            sb.append(line);
            sb.append("\n");
        }
        lnr.close();
        lnr = null;
        // Strip the LexisNexis metadata header; keep only the article body.
        LxNxTextParser tp = new LxNxTextParser(sb.toString(), "");
        String text = tp.getTextBody();
        File file = new File(toFile);
        pw = new PrintWriter(new FileWriter(file));
        String[] lines = text.split("\n");
        for (String l : lines) {
            pw.println(l);
        }
    } catch (FileNotFoundException ex) {
        System.out
                .println(ex.getMessage() + " in the specified directory.");
        ex.printStackTrace();
    } catch (IOException e) {
        System.out.println(e.getMessage());
    } finally {
        // Fix: close both streams even when an exception interrupted the work.
        if (lnr != null) {
            try {
                lnr.close();
            } catch (IOException ignored) {
                // best-effort close
            }
        }
        if (pw != null) {
            pw.close();
        }
    }
}
/**
 * Writes every subject and its text count (tab-separated) to a
 * "subjects2.csv" file under subectsOutputDir. The writer is now closed in a
 * finally block — the original leaked it when writing failed mid-loop.
 * Errors are logged and swallowed, as before.
 */
private void storeSubjectsPerCorpus() {
    PrintWriter pw = null;
    try {
        // NOTE(review): no file separator between directory and file name —
        // kept for compatibility, but this yields "<dir>subjects2.csv";
        // confirm whether fileSeperator was intended here.
        pw = new PrintWriter(new FileWriter(subectsOutputDir + "subjects2.csv"));
        for (String s : htSubjectsPerData.keySet()) {
            pw.println(s.trim() + "\t " + this.htSubjectsPerData.get(s));
        }
    } catch (Exception e4) {
        System.out.println("storeSubjectsPerCorpus() error: " + e4);
        e4.printStackTrace();
    } finally {
        // Fix: release the writer even on failure.
        if (pw != null) {
            pw.close();
        }
    }
}
/**
 * Like storeSubjectsPerCorpus(), but only writes entries contained in the
 * ID set loaded by getIdsOfTextsFromList(). Fixes two defects of the
 * original: the writer was leaked when writing failed mid-loop, and the
 * error message named the wrong method ("storeSubjectsPerCorpus").
 */
private void storeSubjectsPerSet() {
    PrintWriter pw = null;
    try {
        HashSet<String> hsidsForSubject = this.getIdsOfTextsFromList();
        // NOTE(review): same missing-separator concern as in
        // storeSubjectsPerCorpus(); kept for compatibility.
        pw = new PrintWriter(new FileWriter(subectsOutputDir + "subjects2.csv"));
        for (String s : htSubjectsPerData.keySet()) {
            if (hsidsForSubject.contains(s)) {
                pw.println(s.trim() + "\t " + this.htSubjectsPerData.get(s));
            }
        }
    } catch (Exception e4) {
        // Fix: the message previously said "storeSubjectsPerCorpus()".
        System.out.println("storeSubjectsPerSet() error: " + e4);
        e4.printStackTrace();
    } finally {
        // Fix: release the writer even on failure.
        if (pw != null) {
            pw.close();
        }
    }
}
/**
 * Loads the set of considered text IDs from a hard-coded list file, one
 * trimmed ID per line. Returns an empty set when the file cannot be read.
 * Fixes two defects of the original: the reader was leaked on failure, and
 * the error message named the wrong method ("storeSubjectsPerSet").
 *
 * @return the IDs read from the list file (possibly empty, never null)
 */
private HashSet<String> getIdsOfTextsFromList() {
    HashSet<String> hs = new HashSet<String>();
    LineNumberReader lnr = null;
    try {
        //FileReader fr = new FileReader("C:\\myMind\\projects\\sudan\\ids_for_subject_after_sports.txt");
        // NOTE(review): hard-coded absolute Windows path — consider making
        // this configurable.
        lnr = new LineNumberReader(new FileReader("C:\\myMind\\projects\\IAN_counterterrorism\\ids_for_subject_after_sports.txt"));
        String line;
        while ((line = lnr.readLine()) != null) {
            hs.add(line.trim());
        }
    } catch (Exception e5) {
        // Fix: the message previously said "storeSubjectsPerSet()".
        System.out.println("getIdsOfTextsFromList() error: " + e5);
        e5.printStackTrace();
    } finally {
        // Fix: release the reader even on failure.
        if (lnr != null) {
            try {
                lnr.close();
            } catch (IOException ignored) {
                // best-effort close
            }
        }
    }
    return hs;
}
/**
 * Two-pass scan of {@code duplicateIdsFile}: the first pass counts lines so
 * the second pass can report progress while processing each duplicate ID
 * (the actual deletion is still a TODO). Fixes a resource leak in the
 * original: the first reader was never closed before being replaced, and
 * neither reader was closed when an exception occurred. An unused
 * StringBuffer was also removed.
 *
 * @param duplicateIdsFile text file with one duplicate text ID per line
 */
private void removeDuplicatedTextsFromDB(String duplicateIdsFile) {
    LineNumberReader lnr = null;
    try {
        // Pass 1: count the lines.
        lnr = new LineNumberReader(new FileReader(duplicateIdsFile));
        int numberOfLines = 0;
        while (lnr.readLine() != null) {
            numberOfLines++;
        }
        lnr.close(); // fix: the first reader was previously leaked on reopen
        // Pass 2: process each ID, reporting progress.
        lnr = new LineNumberReader(new FileReader(duplicateIdsFile));
        String line;
        int counter = 0;
        while ((line = lnr.readLine()) != null) {
            line = line.trim();
            //TODO : delete record in metadata file
            //st.execute("DELETE from 6_moreRedundantsOut WHERE ID='" + line + "'");
            System.out.println((counter++) + "/" + "numberOfLines: " + numberOfLines);
        }
    } catch (Exception e5) {
        System.out.println("removeDuplicatedTextsFromDB() error: " + e5);
        e5.printStackTrace();
    } finally {
        // Fix: release the reader even on failure.
        if (lnr != null) {
            try {
                lnr.close();
            } catch (IOException ignored) {
                // best-effort close
            }
        }
    }
}
/**
 * Legacy, effectively disabled duplicate-removal routine that used to read
 * records from a "texts_orig" database table. NOTE(review): {@code con} is
 * hard-wired to null, so {@code con.prepareStatement(...)} always throws a
 * NullPointerException, which the blanket catch logs — this method is dead
 * code in its current form and should be fixed or removed once the metadata
 * source is decided (see TODO).
 */
private void removeDuplicates() {
try {
//TODO : fix this for metadata
Connection con = null; /*this.getConnection();*/
PreparedStatement ps = con.prepareStatement("SELECT * FROM texts_orig");
ResultSet rs = ps.executeQuery();
Hashtable<String, Vector<TextDataSet>> htData = new Hashtable<String, Vector<TextDataSet>>();
HashSet<String> hsIdsOfDuplicates = new HashSet<String>();
while (rs.next()) {
TextDataSet tds = new TextDataSet();
String id = rs.getString("ID");
System.out.println(this.getClass().getName() + ".removeDuplicates() id " + id);
/*
tds.id = rs.getString("ID");
System.out.println(this.getClass().getName()+".removeDuplicates() tds.id "+tds.id);
tds.title = rs.getString("Title");
tds.pubDate="";
//System.out.println(this.getClass().getName()+".removeDuplicates() tds.id "+tds.id);
//tds.pubDate = rs.getString("PubDate");
Vector<TextDataSet> v = htData.get(tds.getCharacteristicString());
if(v==null){
v=new Vector<TextDataSet>();
v.add(tds);
if(tds.id!=null && tds.pubDate!=null && tds.title!=null){
htData.put(tds.getCharacteristicString(), v);
}
}else{
TextDataSet existingDataSet = v.get(0);
System.out.println(this.getClass().getName()+".removeDuplicates() douplicate id "+tds.id+" of existing id "+existingDataSet.id);
hsIdsOfDuplicates.add(tds.id);
}
*/
}
} catch (Exception e) {
System.out.println(this.getClass().getName() + ".removeDuplicates() error:" + e);
e.printStackTrace();
}
}
/**
 * Adds same-list co-occurrence edges for {@code entityList} to {@code graph}
 * using the default relevance threshold of 75 percent.
 */
private void addToGraph(List<CodebookEntity> entityList, Map<CodebookEntity, List<MyPair<CodebookEntity, Integer>>> graph) {
addToGraph(entityList, graph, 75);
}
/**
 * Adds co-occurrence edges between all pairs within {@code entityList} whose
 * members both exceed {@code threshold_} percent relevance. Pairs are
 * ordered by name so each undirected edge (and no self-edge) is stored once;
 * repeated co-occurrences increment the edge weight.
 */
private void addToGraph(List<CodebookEntity> entityList, Map<CodebookEntity, List<MyPair<CodebookEntity, Integer>>> graph, int threshold_) {
    for (CodebookEntity first : entityList) {
        for (CodebookEntity sec : entityList) {
            // Canonical ordering avoids self-edges and mirrored duplicates.
            if (first.getName().compareTo(sec.getName()) <= 0) {
                continue;
            }
            // Both endpoints must clear the relevance threshold.
            if (first.getPercent() <= threshold_ || sec.getPercent() <= threshold_) {
                continue;
            }
            List<MyPair<CodebookEntity, Integer>> edges = graph.get(first);
            if (edges == null) {
                edges = new ArrayList<MyPair<CodebookEntity, Integer>>();
                graph.put(first, edges);
            }
            int index = getPairIndex(first, sec, graph);
            if (index == -1) {
                // First co-occurrence of this pair: start with weight 1.
                edges.add(new MyPair<CodebookEntity, Integer>(sec, 1));
            } else {
                // Seen before: bump the edge weight.
                MyPair<CodebookEntity, Integer> edge = edges.get(index);
                edge.setSecond(edge.getSecond() + 1);
            }
        }
    }
}
/**
 * Bipartite variant: adds an undirected edge to {@code graph} for every
 * cross pair (one entity from {@code entityList1}, one from
 * {@code entityList2}) whose percent values both exceed {@code threshold_}.
 * Unlike the single-list overload there is no name-ordering guard, so edges
 * are keyed on the entity from the first list. A repeated pair increments
 * the weight of the existing edge.
 *
 * @param entityList1 source-side entities (become the map keys)
 * @param entityList2 target-side entities
 * @param graph adjacency map (node -> list of (neighbor, weight)) updated in place
 * @param threshold_ minimum percent (exclusive) for an entity to participate
 */
private void addToGraph(List<CodebookEntity> entityList1, List<CodebookEntity> entityList2, Map<CodebookEntity, List<MyPair<CodebookEntity, Integer>>> graph, int threshold_) {
    for (CodebookEntity first : entityList1) {
        for (CodebookEntity sec : entityList2) {
            // Skip pairs where either side is at or below the threshold.
            if (first.getPercent() <= threshold_ || sec.getPercent() <= threshold_) {
                continue;
            }
            if (!graph.containsKey(first)) {
                graph.put(first, new ArrayList<MyPair<CodebookEntity, Integer>>());
            }
            // Hoist the adjacency list instead of repeating graph.get(first).
            List<MyPair<CodebookEntity, Integer>> edges = graph.get(first);
            int index = getPairIndex(first, sec, graph);
            if (index != -1) {
                // Edge already present: bump its weight by one.
                MyPair<CodebookEntity, Integer> edge = edges.get(index);
                edge.setSecond(edge.getSecond() + 1);
            } else {
                edges.add(new MyPair<CodebookEntity, Integer>(sec, 1));
            }
        }
    }
}
/**
 * Looks up the position of the edge (first -> sec) inside first's adjacency
 * list in {@code graph}.
 *
 * @param first source node; must already be a key of {@code graph}
 * @param sec neighbor to search for
 * @param graph adjacency map (node -> list of (neighbor, weight))
 * @return the index of the pair whose neighbor equals {@code sec}, or -1 if absent
 */
private Integer getPairIndex(CodebookEntity first, CodebookEntity sec, Map<CodebookEntity, List<MyPair<CodebookEntity, Integer>>> graph) {
    List<MyPair<CodebookEntity, Integer>> edges = graph.get(first);
    int position = 0;
    for (MyPair<CodebookEntity, Integer> edge : edges) {
        if (edge.getFirst().equals(sec)) {
            return position;
        }
        position++;
    }
    return -1;
}
/**
 * Returns the configured text-body output directory.
 *
 * @return the {@code textBodyToDirectory} path, or null until it is set
 */
public String getTextBodyToDirectory() {
return textBodyToDirectory;
}
/**
 * Sets the text-body output directory.
 *
 * @param textBodyToDirectory directory path used for text-body output
 */
public void setTextBodyToDirectory(String textBodyToDirectory) {
this.textBodyToDirectory = textBodyToDirectory;
}
/**
 * Returns the configured directory for duplicate text bodies.
 *
 * @return the {@code textBodyDuplicateDirectory} path, or null until it is set
 */
public String getTextBodyDuplicateDirectory() {
return textBodyDuplicateDirectory;
}
/**
 * Sets the directory for duplicate text bodies.
 *
 * @param textBodyDuplicateDirectory directory path for duplicate text bodies
 */
public void setTextBodyDuplicateDirectory(String textBodyDuplicateDirectory) {
this.textBodyDuplicateDirectory = textBodyDuplicateDirectory;
}
/**
 * Returns the configured directory for unique (non-duplicate) text bodies.
 *
 * @return the {@code textBodyUniqueDirectory} path, or null until it is set
 */
public String getTextBodyUniqueDirectory() {
return textBodyUniqueDirectory;
}
/**
 * Sets the directory for unique (non-duplicate) text bodies.
 *
 * @param textBodyUniqueDirectory directory path for unique text bodies
 */
public void setTextBodyUniqueDirectory(String textBodyUniqueDirectory) {
this.textBodyUniqueDirectory = textBodyUniqueDirectory;
}
/**
 * Returns the configured path of the combined unique-texts file.
 *
 * @return the {@code uniqueAllTextFile} path, or null until it is set
 */
public String getUniqueAllTextFile() {
return uniqueAllTextFile;
}
/**
 * Sets the path of the combined unique-texts file.
 *
 * @param uniqueAllTextFile file path for the combined unique texts
 */
public void setUniqueAllTextFile(String uniqueAllTextFile) {
this.uniqueAllTextFile = uniqueAllTextFile;
}
/**
 * Returns the configured GEXF output path for the all-entities network.
 *
 * @return the {@code gexfAllNetworkFile} path, or null until it is set
 */
public String getGexfAllNetworkFile() {
return gexfAllNetworkFile;
}
/**
 * Sets the GEXF output path for the all-entities network.
 *
 * @param gexfAllNetworkFile GEXF file path for the all-entities network
 */
public void setGexfAllNetworkFile(String gexfAllNetworkFile) {
this.gexfAllNetworkFile = gexfAllNetworkFile;
}
/**
 * Returns the configured GEXF output path for the person network.
 *
 * @return the {@code gexfPersonNetworkFile} path, or null until it is set
 */
public String getGexfPersonNetworkFile() {
return gexfPersonNetworkFile;
}
/**
 * Sets the GEXF output path for the person network.
 *
 * @param gexfPersonNetworkFile GEXF file path for the person network
 */
public void setGexfPersonNetworkFile(String gexfPersonNetworkFile) {
this.gexfPersonNetworkFile = gexfPersonNetworkFile;
}
/**
 * Returns the configured GEXF output path for the subject network.
 *
 * @return the {@code gexfSubjectNetworkFile} path, or null until it is set
 */
public String getGexfSubjectNetworkFile() {
return gexfSubjectNetworkFile;
}
/**
 * Sets the GEXF output path for the subject network.
 *
 * @param gexfSubjectNetworkFile GEXF file path for the subject network
 */
public void setGexfSubjectNetworkFile(String gexfSubjectNetworkFile) {
this.gexfSubjectNetworkFile = gexfSubjectNetworkFile;
}
/**
 * Parses the metadata field of {@code mdata} that corresponds to
 * {@code type1} into codebook entities, tagging each parse with the
 * matching ontology category (agent / location / organization / knowledge).
 *
 * @param type1 which metadata field to parse
 * @param mdata the document metadata record supplying the raw field text
 * @return the parsed entities, or null (after logging) for an unrecognized type
 */
public List<CodebookEntity> getCodebookEntities(MetadataType type1, LxNxMetadata mdata) {
    // Early-return chain: each recognized type maps directly to one parse call.
    if (type1.equals(MetadataType.PERSON)) {
        return CodebookEntity.parseLine(mdata.getPerson(), "agent", "specific", mdata.getTextID());
    }
    if (type1.equals(MetadataType.LOCATION)) {
        return CodebookEntity.parseLine(mdata.getGeo(), "location", "specific", mdata.getTextID());
    }
    if (type1.equals(MetadataType.ORGANIZATION)) {
        return CodebookEntity.parseLine(mdata.getOrganization(), "organization", "specific", mdata.getTextID());
    }
    if (type1.equals(MetadataType.SUBJECT)) {
        return CodebookEntity.parseLine(mdata.getSubject(), "knowledge", "specific", mdata.getTextID());
    }
    System.out.println("Invalid type in getCodebookEntities");
    return null;
}
/**
 * Writes every edge of {@code graph} to {@code csvWriter}, one row per edge,
 * in the form {@code source,target,weight,Undirected}. The integer weight is
 * widened to float on purpose so it renders as e.g. "1.0", preserving the
 * established output format.
 *
 * @param graph adjacency map (node -> list of (neighbor, weight))
 * @param csvWriter destination for the CSV edge rows
 */
private void writeGraphToCSV(Map<CodebookEntity, List<MyPair<CodebookEntity, Integer>>> graph, CSVWriter csvWriter) {
    // Iterate entries directly rather than keySet() + get(): one lookup per node.
    for (Map.Entry<CodebookEntity, List<MyPair<CodebookEntity, Integer>>> entry : graph.entrySet()) {
        CodebookEntity node = entry.getKey();
        for (MyPair<CodebookEntity, Integer> other : entry.getValue()) {
            CodebookEntity node2 = other.getFirst();
            float weight = other.getSecond();
            csvWriter.append(node.getName() + "," + node2.getName() + "," + weight + "," + "Undirected");
        }
    }
}
}