package se.kodapan.osm.sweden.ext.se.posten.postnummer; import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.cyberneko.html.parsers.DOMParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import javax.xml.xpath.*; import java.io.*; import java.net.URLEncoder; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * @author kalle * @since 2013-01-02 00:07 */ public class PostenPostnummerService { private static Logger log = LoggerFactory.getLogger(PostenPostnummerService.class); public PostenPostnummerService() throws Exception { } public void open() throws Exception { } public void close() throws Exception { } private static String defaultUserAgent = "Unnamed instance of " + PostenPostnummerService.class.getName() + ", https://github.com/karlwettin/osm-common/"; private String userAgent = defaultUserAgent; private HttpClient client = new DefaultHttpClient(); private DOMParser p = new DOMParser(); private XPath xpath = XPathFactory.newInstance().newXPath(); private XPathExpression gatunamnExpression = xpath.compile("parent::TR/TD[1]"); private XPathExpression husnummerserieExpression = xpath.compile("parent::TR/TD[2]"); private XPathExpression postnummerExpression = xpath.compile("//TBODY/TR/TD[3]"); private XPathExpression postortExpression = xpath.compile("parent::TR/TD[4]"); private Pattern husnummerseriePattern = Pattern.compile("([0-9]+)\\W*-\\W*([0-9]+)"); public PostenPostnummerServiceResponse queryStreetAddress(String streetAddress, String city) throws Exception { StringBuilder url = new StringBuilder(); url.append("http://www.posten.se/soktjanst/postnummersok/resultat.jspv?"); if (streetAddress != null) { url.append("gatunamn="); url.append(URLEncoder.encode(streetAddress, "iso-8859-1").replaceAll("\\+", "%20")); url.append("&"); } if (city != null) { url.append("po="); url.append(URLEncoder.encode(city, "iso-8859-1").replaceAll("\\+", "%20")); url.append("&"); } HttpGet get = new HttpGet(url.toString()); log.info("Executing street address query: " + url); return parseResponse(get); } public PostenPostnummerServiceResponse queryPostnummer(String postnummer) throws Exception { StringBuilder url = new StringBuilder(); url.append("http://www.posten.se/soktjanst/postnummersok/resultat.jspv?"); url.append("pnr="); url.append(URLEncoder.encode(postnummer, "iso-8859-1")); url.append("&"); HttpGet get = new HttpGet(url.toString()); log.info("Executing postnummer query: " + url); return parseResponse(get); } private PostenPostnummerServiceResponse parseResponse(HttpGet get) throws IOException, SAXException, XPathExpressionException { if (defaultUserAgent.equals(userAgent)) { throw new NullPointerException("PostenPostnummerService HTTP header User-Agent not set!"); } get.setHeader("User-Agent", userAgent); HttpResponse httpResponse = client.execute(get); StringWriter buffer = new StringWriter(); InputStream httpResponseStream = httpResponse.getEntity().getContent(); try { IOUtils.copy(new InputStreamReader(httpResponseStream, "iso-8859-1"), buffer); } finally { httpResponse.getEntity().getContent().close(); } log.debug("Postnummer response received"); List<PostenPostnummerServiceRecord> records = new ArrayList<PostenPostnummerServiceRecord>(); PostenPostnummerServiceResponse response = new PostenPostnummerServiceResponse(records); if (buffer.toString().contains("Sökningen gav ett stort antal träffar")) { response.setLimitedResults(true); } p.setFeature("http://xml.org/sax/features/namespaces", false); p.parse(new InputSource(new StringReader(buffer.toString()))); NodeList results = (NodeList) postnummerExpression.evaluate(p.getDocument(), XPathConstants.NODESET); if (results.getLength() > 0) { for (int i = 0; i < results.getLength(); i++) { PostenPostnummerServiceRecord record = new PostenPostnummerServiceRecord(); records.add(record); record.setPostnummer(results.item(i).getTextContent().replaceAll("[^0-9]+", "")); Matcher matcher = husnummerseriePattern.matcher(husnummerserieExpression.evaluate(results.item(i))); if (matcher.find()) { record.setStarthusnummer(Integer.valueOf(matcher.group(1))); record.setSluthusnummer(Integer.valueOf(matcher.group(2))); } record.setGatunamn(gatunamnExpression.evaluate(results.item(i))); record.setPostort(postortExpression.evaluate(results.item(i))); } } log.debug("Postnummer response resolved"); return response; } private static char[] resolverAlphabet = new char[]{ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'å', 'ä', 'ö', }; /** * Creates a response that contains all records (house number segements of streets, poboxes etc) for a given postnummer. * @param postnummer * @return * @throws Exception */ public PostenPostnummerServiceResponse resolveAllServiceRecordsInPostnummer(String postnummer) throws Exception { PostenPostnummerServiceResponse response = queryPostnummer(postnummer); if (!response.getRecords().isEmpty() && response.isLimitedResults()) { Set<PostenPostnummerServiceRecord> records = new LinkedHashSet<PostenPostnummerServiceRecord>(1000); for (char character : resolverAlphabet) { resolvePostnummer(String.valueOf(character), records, response.getRecords().get(0).getPostort(), response.getRecords().get(0).getPostnummer()); } response = new PostenPostnummerServiceResponse(); response.setRecords(new ArrayList<PostenPostnummerServiceRecord>(records)); } return response; } private void resolvePostnummer(String streetNamePrefix, Set<PostenPostnummerServiceRecord> records, String postort, String postnummer) throws Exception { PostenPostnummerServiceResponse partsResponse = queryStreetAddress(streetNamePrefix.trim(), postort); for (PostenPostnummerServiceRecord record : partsResponse.getRecords()) { if (postnummer.equals(record.getPostnummer())) { records.add(record); } } if (partsResponse.isLimitedResults()) { for (char character : resolverAlphabet) { resolvePostnummer(streetNamePrefix + character, records, postort, postnummer); } if (!streetNamePrefix.endsWith(" ")) { resolvePostnummer(streetNamePrefix + " ", records, postort, postnummer); } } } public Set<String> gatherAllPostorter() throws Exception { return gatherAllPostnummerByPostorter().keySet(); } public Map<String, Set<String>> gatherAllPostnummerByPostorter() throws Exception { return gatherAllPostnummerByPostorter(10000, 99999); } public Map<String, Set<String>> gatherAllPostnummerByPostorter(int start, int end) throws Exception { log.info("Gathering all postnummer by postorter..."); Map<String, Set<String>> postnummerByPostorter = new HashMap<String, Set<String>>(); for (int i=start; i<=end; i++) { PostenPostnummerServiceResponse response = queryPostnummer(String.valueOf(i)); for (PostenPostnummerServiceRecord record : response.getRecords()) { Set<String> postnummer = postnummerByPostorter.get(record.getPostort()); if (postnummer == null) { postnummer = new HashSet<String>(); postnummerByPostorter.put(record.getPostort(), postnummer); } postnummer.add(record.getPostnummer()); } } return postnummerByPostorter; } public String getUserAgent() { return userAgent; } public void setUserAgent(String userAgent) { this.userAgent = userAgent; } }