package gov.nysenate.openleg.util;
import gov.nysenate.openleg.model.base.SessionYear;
import gov.nysenate.openleg.model.entity.Chamber;
import gov.nysenate.openleg.model.entity.SessionMember;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.util.stream.Collectors.toList;
/**
 * Quick and dirty member scraping. This scraper will break with site redesigns so always test first.
 *
 * <p>Both scrapers select several element lists from the fetched pages and combine them by
 * position, so the i-th element of every selection is assumed to describe the same member.
 * If a page yields unexpected text or fewer elements than needed, an
 * {@link IllegalStateException} describing the mismatch is thrown.
 */
public class MemberScraperUtils
{
    private static final Logger logger = LoggerFactory.getLogger(MemberScraperUtils.class);

    /** Page from which Assembly member pictures and full names are selected. */
    static final String ASSEMBLY_MEMBER_DIR_URL = "http://assembly.state.ny.us/mem/";

    /** Page from which Assembly member last names and districts are selected. */
    static final String ASSEMBLY_MEMBER_LISTING_URL = "http://assembly.state.ny.us/mem/email/";

    /** Page from which Senate member pictures, names and districts are selected. */
    static final String SENATE_MEMBER_DIR_URL = "http://www.nysenate.gov/senators";

    /** Leading digits followed by at least one word character; must match the whole district text. */
    private static final Pattern ASSEMBLY_DISTRICT_PATTERN = Pattern.compile("(\\d+)\\w+");

    /** The literal "District " followed by digits; may occur anywhere in the district text. */
    private static final Pattern SENATE_DISTRICT_PATTERN = Pattern.compile("District (\\d+)");

    /**
     * Scrapes the Assembly directory and listing pages and builds one member per listing entry.
     *
     * <p>The last name is the text before the first comma of each {@code .email1 a} element;
     * the district, full name and image name are taken from the element at the same position
     * in their respective selections.
     *
     * @return members for the current session year, with chamber set to {@link Chamber#ASSEMBLY}
     * @throws IOException if either page cannot be fetched
     * @throws IllegalStateException if a district text has an unexpected format, or if fewer
     *         full names, images or districts than last names were found
     */
    public static List<SessionMember> getAssemblyMembers() throws IOException {
        Document directoryPage = Jsoup.connect(ASSEMBLY_MEMBER_DIR_URL).get();
        Document listingPage = Jsoup.connect(ASSEMBLY_MEMBER_LISTING_URL).get();

        Elements csvNameElements = listingPage.select(".email1 a");
        Elements districtElements = listingPage.select(".email2");
        Elements picElements = directoryPage.select(".mem-pic a img");
        Elements fullNameElements = directoryPage.select(".leader-info strong a");

        List<String> fullNames = fullNameElements.stream().map(e -> e.text()).collect(toList());
        List<Integer> districts = districtElements.stream()
                .map(e -> parseAssemblyDistrict(e.text()))
                .collect(toList());
        List<String> lastNames = csvNameElements.stream()
                .map(e -> e.text().split(",")[0])
                .collect(toList());
        List<String> imageNames = picElements.stream().map(i -> i.attr("src")).collect(toList());

        // The lists are combined by index below, so each must cover every last name.
        int memberCount = lastNames.size();
        requireCoverage("assembly full names", fullNames, memberCount);
        requireCoverage("assembly images", imageNames, memberCount);
        requireCoverage("assembly districts", districts, memberCount);

        List<SessionMember> members = new ArrayList<>(memberCount);
        for (int i = 0; i < memberCount; i++) {
            SessionMember m = new SessionMember();
            m.setLastName(lastNames.get(i));
            m.setFullName(fullNames.get(i));
            m.setImgName(imageNames.get(i));
            m.setDistrictCode(districts.get(i));
            m.setSessionYear(SessionYear.current());
            m.setChamber(Chamber.ASSEMBLY);
            members.add(m);
        }
        return members;
    }

    /**
     * Scrapes the Senate directory page and builds one member per name element.
     *
     * <p>Each name is split on commas: the first part becomes the last name and the second part
     * is placed in front of it to form the full name. Any text after a second comma is ignored.
     *
     * @return members for the current session year, with chamber set to {@link Chamber#SENATE}
     * @throws IOException if the page cannot be fetched
     * @throws IllegalStateException if a district text or name has an unexpected format, or if
     *         fewer images or districts than names were found
     */
    public static List<SessionMember> getSenateMembers() throws IOException {
        Document directoryPage = Jsoup.connect(SENATE_MEMBER_DIR_URL).get();
        Elements nodes = directoryPage.select(".view-senators .view-content .views-row");
        Elements imageElems = nodes.select(".views-field-field-profile-picture-fid img");
        Elements nameElems = nodes.select(".views-field-field-last-name-value .field-content > a");
        Elements districtElems = nodes.select(".views-field-field-senators-district-nid span");

        List<String> imageUrls = imageElems.stream().map(i -> i.attr("src")).collect(toList());
        List<String> names = nameElems.stream().map(n -> n.text()).collect(toList());
        List<Integer> districts = districtElems.stream()
                .map(d -> parseSenateDistrict(d.text()))
                .collect(toList());

        // The lists are combined by index below, so each must cover every name.
        int senatorCount = names.size();
        requireCoverage("senate images", imageUrls, senatorCount);
        requireCoverage("senate districts", districts, senatorCount);

        List<SessionMember> senators = new ArrayList<>(senatorCount);
        for (int i = 0; i < senatorCount; i++) {
            String name = names.get(i);
            String[] splitName = name.split(",");
            if (splitName.length < 2) {
                throw new IllegalStateException(
                        "Expected a comma separated senator name but found: '" + name + "'");
            }
            // Trim so the space that follows the comma does not end up leading the full name.
            String lastName = splitName[0].trim();
            String firstName = splitName[1].trim();

            SessionMember m = new SessionMember();
            m.setLastName(lastName);
            m.setFullName(firstName + " " + lastName);
            m.setImgName(imageUrls.get(i));
            m.setDistrictCode(districts.get(i));
            m.setSessionYear(SessionYear.current());
            m.setChamber(Chamber.SENATE);
            senators.add(m);
        }
        return senators;
    }

    /** Extracts the leading number from an Assembly district text; the whole text must match. */
    private static int parseAssemblyDistrict(String text) {
        Matcher m = ASSEMBLY_DISTRICT_PATTERN.matcher(text);
        if (!m.matches()) {
            throw new IllegalStateException("Unrecognized assembly district text: '" + text + "'");
        }
        return Integer.parseInt(m.group(1));
    }

    /** Extracts the number following "District " from anywhere within a Senate district text. */
    private static int parseSenateDistrict(String text) {
        Matcher m = SENATE_DISTRICT_PATTERN.matcher(text);
        if (!m.find()) {
            throw new IllegalStateException("Unrecognized senate district text: '" + text + "'");
        }
        return Integer.parseInt(m.group(1));
    }

    /**
     * Verifies that a scraped list has an entry for every member that will be built.
     * Too few entries is an error; surplus entries are ignored by the callers, so they are
     * only reported as a warning.
     */
    private static void requireCoverage(String label, List<?> values, int expected) {
        int actual = values.size();
        if (actual < expected) {
            throw new IllegalStateException("Scraped " + actual + " " + label
                    + " but need " + expected + "; the page layout may have changed");
        }
        if (actual > expected) {
            logger.warn("Scraped {} {} but only {} will be used; member data may be misaligned",
                    actual, label, expected);
        }
    }
}