package edu.usc.cssl.tacit.crawlers.senate.services;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;

import org.eclipse.core.runtime.IProgressMonitor;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import edu.usc.cssl.tacit.common.ui.views.ConsoleView;

/**
 * Crawls Senate records from thomas.loc.gov for a set of selected senators.
 *
 * <p>For every record in which the senator speaks, the extracted text is written to
 * {@code <outputDir>/<senator>/<file>.txt} and one row is appended to a
 * {@code senate-crawler-summary-<timestamp>.csv} file in {@code outputDir}.
 *
 * <p>Usage: call {@link #initialize} once, then {@link #crawl()}.
 */
public class SenateCrawler {

	/** Number of record files written so far by {@link #writeToFile}. */
	public int totalFilesDownloaded = 0;

	ArrayList<Integer> congresses = new ArrayList<Integer>();
	String dateFrom, dateTo;
	/** Maximum number of records per senator and congress; {@code -1} means no limit. */
	int maxDocs = 10;
	String outputDir;
	BufferedWriter csvWriter;
	String sortType;
	/** Link captions on the search result page that never point to a record. */
	HashSet<String> irrelevantLinks = new HashSet<String>(Arrays.asList("Next Document","New CR Search","Prev Document","HomePage","Help","GPO's PDF"));
	private ArrayList<String> selectedSenators;
	/** Congress to crawl; {@code -1} means every congress in {@link #congresses}. */
	private int congressNum;
	IProgressMonitor monitor;
	int progressSize;
	/** Congress number (as string) -> (senator display text -> senator name used in the query). */
	HashMap<String, HashMap<String, String>> congressSenatorMap = AvailableRecords.getCongressSenatorMap();
	/** Senator display text -> details string; the part before the first '-' is the party. */
	HashMap<String, String> senatorDetails = SenatorDetails.getSenatorDetails(); // to populate all senator details

	/**
	 * Normalises the selection so that no senator is crawled twice.
	 *
	 * <p>"All Senators" replaces every other entry. Otherwise, for each selected party
	 * group ("All Democrats", "All Republicans", "All Independents") the individually
	 * selected members of that party are dropped, because the group already covers them.
	 * The list passed to {@link #initialize} is left untouched; a copy is stored instead.
	 */
	private void formatSenatorList() {
		ArrayList<String> tempSenators = new ArrayList<String>(this.selectedSenators);

		if (tempSenators.contains("All Senators")) {
			// The collapse has to happen on the copy that is stored below; collapsing
			// the old list and then overwriting it with the copy would undo it.
			tempSenators.clear();
			tempSenators.add("All Senators");
		} else {
			if (tempSenators.contains("All Democrats")) {
				removePartyMembers(tempSenators, "(D-", "D/");
			}
			if (tempSenators.contains("All Republicans")) {
				removePartyMembers(tempSenators, "(R-", "R/");
			}
			if (tempSenators.contains("All Independents")) {
				removePartyMembers(tempSenators, "(I-", "I/");
			}
		}
		this.selectedSenators = tempSenators;
	}

	/** Removes every entry containing either of the two party markers. */
	private void removePartyMembers(ArrayList<String> senators, String primaryMarker, String secondaryMarker) {
		for (Iterator<String> it = senators.iterator(); it.hasNext(); ) {
			String s = it.next();
			if (s.contains(primaryMarker) || s.contains(secondaryMarker)) {
				it.remove();
			}
		}
	}

	/**
	 * Reports whether the user cancelled the job and, if so, updates the monitor's sub task.
	 *
	 * @return {@code true} when a monitor is present and has been cancelled
	 */
	private boolean cancelRequested() {
		if (null != monitor && monitor.isCanceled()) {
			monitor.subTask("Cancelling.. ");
			return true;
		}
		return false;
	}

	/** Returns the text before the first '-', or "" when there is none. */
	private String firstDashToken(String text) {
		// String.split drops trailing empty strings, so "-" yields an empty array.
		String[] parts = text.split("-");
		return parts.length == 0 ? "" : parts[0];
	}

	/**
	 * Derives the party abbreviation from a senator's display text.
	 *
	 * <p>The text inside the last parenthesis is used when it contains a '-'
	 * (the part before the '-' is returned); otherwise the value is looked up in
	 * {@link #senatorDetails}. Returns "" when neither source yields a value.
	 */
	private String resolvePoliticalAffiliation(String senatorKey) {
		int open = senatorKey.lastIndexOf('(');
		if (open == -1) {
			return "";
		}
		int end = senatorKey.length() - 1; // drops the last character of the display text
		String affiliation = (open + 1 <= end) ? senatorKey.substring(open + 1, end) : "";
		if (-1 != affiliation.indexOf('-')) {
			return firstDashToken(affiliation);
		}
		String details = senatorDetails.get(senatorKey);
		return details == null ? "" : firstDashToken(details);
	}

	/** Returns the senator map of the given congress, or an empty map when it is unknown. */
	private HashMap<String, String> senatorsOf(int congress) {
		HashMap<String, String> senators = congressSenatorMap.get(String.valueOf(congress));
		return senators != null ? senators : new HashMap<String, String>();
	}

	/**
	 * Runs the crawl for the configuration given to {@link #initialize}.
	 *
	 * <p>Creates the CSV summary in {@code outputDir}, then processes every selected entry:
	 * group entries ("All ...") are delegated to {@link #getAll}, individual senators to
	 * {@link #searchSenatorRecords}. Returns early when the monitor is cancelled. The CSV
	 * writer is closed on every exit path.
	 *
	 * @throws IOException if the summary file or a record cannot be written, or a request fails
	 */
	public void crawl() throws IOException {
		if (cancelRequested()) {
			return;
		}
		formatSenatorList();

		DateFormat df = new SimpleDateFormat("MM-dd-yyyy-HH-mm-ss");
		Date dateobj = new Date();
		csvWriter = new BufferedWriter(new FileWriter(new File(outputDir + System.getProperty("file.separator") + "senate-crawler-summary-"+df.format(dateobj)+".csv")));
		try {
			csvWriter.write("Congress,Date,Senator,Political Affiliation,State,Title,File");
			csvWriter.newLine();

			for (String senText : selectedSenators) {
				int tempProgressSize = progressSize/selectedSenators.size();
				boolean isGroup = senText.equals("All Senators") || senText.equals("All Republicans")
						|| senText.equals("All Democrats") || senText.equals("All Independents");

				if (isGroup) {
					if (congressNum != -1) {
						if (cancelRequested()) {
							return;
						}
						getAll(congressNum, senText, tempProgressSize);
					} else {
						for (int congress : congresses) {
							if (cancelRequested()) {
								return;
							}
							getAll(congress, senText, tempProgressSize/congresses.size());
						}
					}
				} else {
					String politicalAffiliation = resolvePoliticalAffiliation(senText);
					if (congressNum == -1) { // All congress
						for (int congress : congresses) {
							if (cancelRequested()) {
								return;
							}
							System.out.println("Extracting Records from Congress "+congress+"...");
							String senatorName = senatorsOf(congress).get(senText);
							if (null != senatorName) {
								searchSenatorRecords(congress, senatorName, tempProgressSize/congresses.size(), politicalAffiliation);
							}
						}
					} else {
						if (cancelRequested()) {
							return;
						}
						System.out.println("Extracting Records from Congress "+congressNum+"...");
						String senatorName = senatorsOf(congressNum).get(senText);
						if (null != senatorName) {
							searchSenatorRecords(congressNum, senatorName, tempProgressSize, politicalAffiliation);
						}
					}
				}

				if (cancelRequested()) {
					return;
				}
			}
		} finally {
			// Also reached on cancellation and on exceptions, so the summary is never left open.
			csvWriter.close();
		}
	}

	/**
	 * Crawls every senator of one congress that belongs to the requested group.
	 *
	 * @param congressNum      congress whose senators are crawled
	 * @param senText          group selector: "All Senators", "All Republicans",
	 *                         "All Democrats" or "All Independents"
	 * @param maxProgressLimit progress units available for this call; split evenly
	 *                         over all senators of the congress
	 * @throws IOException if a request or a file write fails
	 */
	public void getAll(int congressNum, String senText, int maxProgressLimit) throws IOException{
		boolean foundSenator = false;
		HashMap<String, String> senators = senatorsOf(congressNum);

		for (String senator : senators.keySet()) {
			if (null != monitor && monitor.isCanceled()) {
				return;
			}
			if (senator.contains("Any Senator")) { // We just need the senator names
				continue;
			}
			if (senText.contains("All Republicans") && !(senator.contains("(R-") || senator.contains("R/"))) {
				continue;
			}
			if (senText.contains("All Democrats") && !(senator.contains("(D-") || senator.contains("D/"))) {
				continue;
			}
			if (senText.contains("All Independents") && !(senator.contains("(I-") || senator.contains("I/"))) {
				continue;
			}

			String politicalAffiliation = resolvePoliticalAffiliation(senator);
			String senatorName = senators.get(senator);
			if (null != senatorName) {
				searchSenatorRecords(congressNum, senatorName, maxProgressLimit/senators.size(), politicalAffiliation);
				foundSenator = true;
			}
		}

		if (!foundSenator) {
			if (senText.contains("All Republicans")) {
				ConsoleView.printlInConsoleln("No republicans found");
			} else if (senText.contains("All Democrats")) {
				ConsoleView.printlInConsoleln("No democrats found");
			} else if (senText.contains("All Independents")) {
				ConsoleView.printlInConsoleln("No independents found");
			} else {
				ConsoleView.printlInConsoleln("No senators found");
			}
		}
	}

	/**
	 * Stores the crawl configuration. Must be called before {@link #crawl()}.
	 *
	 * @param sortType       value sent as the "sort" field of the search form
	 * @param maxDocs        maximum records per senator and congress, {@code -1} for no limit
	 * @param congressNum    congress to crawl, {@code -1} for all of {@code allCongresses}
	 * @param senatorDetails selected senator display texts and/or "All ..." group entries
	 * @param dateFrom       value sent as the "DateFrom" field of the search form
	 * @param dateTo         value sent as the "DateTo" field of the search form
	 * @param outputDir      existing directory that receives the output
	 * @param allCongresses  congresses crawled when {@code congressNum} is {@code -1}
	 * @param monitor        progress monitor, may be {@code null}
	 * @param progressSize   total progress units allotted to the crawl
	 * @throws IOException declared for callers; not thrown by the current implementation
	 */
	public void initialize(String sortType, int maxDocs, int congressNum, ArrayList<String> senatorDetails, String dateFrom, String dateTo, String outputDir, ArrayList<Integer> allCongresses, IProgressMonitor monitor, int progressSize) throws IOException {
		this.outputDir = outputDir;
		this.maxDocs = maxDocs;
		this.dateFrom = dateFrom;
		this.dateTo = dateTo;
		this.selectedSenators = senatorDetails;
		this.congressNum = congressNum;
		this.sortType = sortType;
		this.congresses = allCongresses;
		this.monitor = monitor;
		this.progressSize = progressSize;

		// Only updates the monitor's sub task; all fields are already assigned at this point.
		cancelRequested();
	}

	/**
	 * Searches the records of one senator in one congress and stores every record in
	 * which the senator speaks.
	 *
	 * <p>Requires {@link #csvWriter} to be open, i.e. to be called while {@link #crawl()} runs.
	 *
	 * @param congress             congress number used in the query
	 * @param senText              senator as expected by the search form, e.g. "Last, First (R-XX)";
	 *                             {@code null} makes the method return immediately
	 * @param progressSize         progress units available for this senator
	 * @param politicalAffiliation party abbreviation written to the CSV summary
	 * @throws IOException if a request or a file write fails
	 */
	public void searchSenatorRecords(int congress,String senText, int progressSize, String politicalAffiliation) throws IOException, NullPointerException{
		ConsoleView.printlInConsoleln("Current Senator - "+senText);
		if (null == senText) {
			return;
		}

		String senatorDir = this.outputDir + File.separator + senText;
		File senatorFolder = new File(senatorDir);
		if (!senatorFolder.exists()) {
			senatorFolder.mkdir();
		}
		if (null != monitor && !monitor.isCanceled()) {
			monitor.subTask("Crawling data for " + senText + "...");
		}

		Document doc = Jsoup.connect("http://thomas.loc.gov/cgi-bin/thomas2")
				.data("xss","query")		// Important. If removed, "301 Moved permanently" error
				.data("queryr"+congress,"")	// Important. 113 - congress number. Make this auto? If removed, "Database Missing" error
				.data("MaxDocs","2000")		// Doesn't seem to be working
				.data("Stemming","No")
				.data("HSpeaker","")
				.data("SSpeaker",senText)
				.data("member","speaking")	// speaking | all -- all occurrences
				.data("relation","or")		// or | and -- when there are multiple speakers in the query
				.data("SenateSection","1")
				//.data("HouseSection","2")
				//.data("ExSection","4")
				//.data("DigestSection","8")
				.data("LBDateSel","Thru")	// "" | 1st | 2nd | Thru -- all sessions, 1st session, 2nd session, range
				.data("DateFrom",dateFrom)
				.data("DateTo",dateTo)
				.data("sort",sortType)		// Default | Date
				.data("submit","SEARCH")
				.userAgent("Mozilla")
				.timeout(10*1000)
				.post();

		// Extracting the relevant links. A page without a "content" element has no results.
		Elements relevantLinks = new Elements();
		Element resultContent = doc.getElementById("content");
		if (resultContent != null) {
			for (Element link : resultContent.getElementsByTag("a")) {
				String caption = link.text();
				if (!irrelevantLinks.contains(caption) && caption.contains("Senate")) {
					relevantLinks.add(link);
				}
			}
		}
		if (relevantLinks.size() == 0) {
			ConsoleView.printlInConsoleln("No Records Found.");
			return;
		}
		Elements links = relevantLinks;

		// Text inside the parenthesis, e.g. "R-XX"; the part after the '-' is the state.
		String[] nameParts = senText.split("\\(");
		String senatorAttribs = nameParts.length > 1 ? nameParts[1].replace(")", "").trim() : "";
		String senatorState = senatorAttribs;
		if (-1 != senatorAttribs.indexOf('-')) {
			String[] attribParts = senatorAttribs.split("-");
			senatorState = attribParts.length > 1 ? attribParts[1] : "";
		}

		boolean unlimited = (maxDocs == -1);
		int count = 0;		// records that actually produced a file
		int tempCount = 0;	// records handled since the last progress tick
		String lastName = senText.split(",")[0];

		// Process each search result
		for (Element link : links) {
			if (cancelRequested()) {
				return;
			}
			if (!unlimited && count >= maxDocs) {
				break;
			}
			count++;

			String recordDate = link.text().replace("(Senate - ", "").replace(",", "").replace(")", "").trim();
			Document record = Jsoup.connect("http://thomas.loc.gov"+link.attr("href")).timeout(10*1000).get();
			String extractLink = findPrinterFriendlyLink(record);

			String[] contents = extract(extractLink,lastName);
			if (contents[1].length()==0) {
				// Nothing extracted for this senator: the record does not count against maxDocs.
				count--;
			} else {
				String title = firstDashToken(contents[0]).trim();
				title = title.replaceAll(",", "");
				title = title.replaceAll("\\.", "");
				String shortTitle = title;
				// NOTE(review): only titles longer than 15 characters are stripped of
				// non-word characters; shorter titles go into the file name unchanged.
				if (title.length()>15) {
					shortTitle = title.substring(0, 15).trim().replaceAll("[^\\w\\s]", "");
				}
				String fileName = congress+"-"+lastName+"-"+senatorAttribs+"-"+recordDate+"-"+shortTitle+"-"+(System.currentTimeMillis()%1000)+".txt";
				writeToFile(senatorDir, fileName, contents);
				csvWriter.write(congress+","+recordDate+","+lastName+","+politicalAffiliation+","+senatorState+","+title+","+fileName);
				csvWriter.newLine();
				csvWriter.flush();
			}
			tempCount++;
			tempCount = updateWork(maxDocs, links.size(), progressSize, tempCount);
		}
	}

	/**
	 * Returns the href of the "Printer Friendly Display" link of a record page,
	 * or "" when the page has no such link.
	 */
	private String findPrinterFriendlyLink(Document record) {
		Element recordContent = record.getElementById("content");
		if (recordContent == null) {
			return "";
		}
		for (Element tabLink : recordContent.select("a[href]")) {
			if (tabLink.text().equals("Printer Friendly Display")) {
				return tabLink.attr("href");
			}
		}
		return "";
	}

	/**
	 * Advances the progress monitor after one record has been handled.
	 *
	 * <p>When there are more records than progress units, one unit is reported every
	 * {@code numDocs/progressSize + 1} records and the running counter is reset;
	 * otherwise {@code progressSize/numDocs - 1} units are reported per record.
	 *
	 * @param maxDocs      configured record limit, {@code -1} is treated as 2000
	 * @param totalLinks   number of records found for the senator
	 * @param progressSize progress units available for the senator
	 * @param tempCount    records handled since the last reported unit
	 * @return the updated running counter
	 */
	private int updateWork(int maxDocs, int totalLinks, int progressSize, int tempCount) {
		int tempMaxDocs = maxDocs == -1 ? 2000 : maxDocs;
		int numDocs2Download = tempMaxDocs > totalLinks ? totalLinks : tempMaxDocs;
		if (numDocs2Download <= 0) {
			return tempCount; // nothing to download, and avoids a division by zero below
		}

		if (numDocs2Download > progressSize && 0 != progressSize) { // worked should be 1
			int totalCount = numDocs2Download/progressSize + 1;
			if (tempCount % totalCount == 0) {
				tempCount = 0;
				if (null != monitor) {
					monitor.worked(1);
				}
			}
		} else if (null != monitor) {
			monitor.worked((progressSize/numDocs2Download)-1);
		}
		return tempCount;
	}

	/**
	 * Writes one record to {@code senatorOutputDir/fileName}: the title, a blank line,
	 * then the extracted text. Increments {@link #totalFilesDownloaded} on success.
	 *
	 * @param contents {@code [0]} title, {@code [1]} extracted text
	 * @throws IOException if the file cannot be created or written
	 */
	private void writeToFile(String senatorOutputDir, String fileName, String[] contents) throws IOException {
		ConsoleView.printlInConsoleln("Writing "+ senatorOutputDir + File.separator + fileName);
		BufferedWriter bw = new BufferedWriter(new FileWriter(new File(senatorOutputDir+System.getProperty("file.separator")+fileName)));
		try {
			bw.write(contents[0]);
			bw.newLine();
			bw.newLine();
			bw.write(contents[1]);
		} finally {
			bw.close(); // release the file handle even when a write fails
		}
		totalFilesDownloaded++;
	}

	/**
	 * Downloads a printer friendly record page and extracts the paragraphs spoken by
	 * the given senator.
	 *
	 * <p>A paragraph whose second word equals the upper-cased last name starts the
	 * senator's speech. Following paragraphs are appended until a paragraph is found
	 * whose second word is all upper case (and is neither "I" nor a number), which is
	 * taken as the next speaker.
	 *
	 * @param extractLink site-relative link of the printer friendly page; "" yields empty contents
	 * @param lastName    last name of the senator
	 * @return two elements: {@code [0]} the page title, {@code [1]} the extracted text
	 *         ("" when the senator does not speak)
	 * @throws IOException if the page cannot be fetched
	 */
	private String[] extract(String extractLink, String lastName) throws IOException {
		String[] contents = new String[] { "", "" };
		if (extractLink == null || extractLink.length() == 0) {
			return contents; // no printer friendly page, nothing to extract
		}

		Document page = Jsoup.connect("http://thomas.loc.gov"+extractLink).timeout(10*1000).get();
		Element container = page.getElementById("container");
		if (container == null) {
			return contents;
		}

		String title = container.select("b").text();
		StringBuilder content = new StringBuilder();
		String speakerTag = lastName.toUpperCase();
		boolean extractFlag = false;

		for (Element line : container.select("p")) {
			String currentLine = line.text().trim();
			if (currentLine.isEmpty()) {
				continue;
			}
			String[] words = currentLine.replaceAll("\u00A0", "").trim().split(" ");
			if (words.length <= 1) {
				continue;
			}

			String currentName = words[1].trim().replace(".", ""); // Check the second word of the sentence.
			currentName = currentName.replace(",", "");
			String firstWord = words[0].trim().replace(".", "");

			if (currentName.equals(speakerTag)) { // Found senator dialogue
				extractFlag = true;
				content.append(currentLine.replace("\u00A0", "").trim()+"\n");
				continue;
			}

			// A first word of at most one character that is not upper case stops extraction.
			if (firstWord.length()<=1 && !firstWord.equals(firstWord.toUpperCase())) {
				extractFlag = false;
			}
			// An all upper case second word other than "I" or a number marks the next speaker.
			if (!currentName.equals("I") && !isNumeric(currentName) && currentName.equals(currentName.toUpperCase())) {
				extractFlag = false;
			}
			// if already extracting, continue until end of file or until next speaker's dialogue
			if (extractFlag) {
				content.append(currentLine.replace("\u00A0", "").trim()+"\n");
			}
		}

		contents[0] = title;
		contents[1] = content.toString();
		return contents;
	}

	/** Returns {@code true} when {@code word} parses as an {@code int}. */
	private boolean isNumeric(String word){
		try {
			Integer.parseInt(word);
			return true;
		} catch (NumberFormatException e) {
			return false;
		}
	}
}