/** * */ package uk.bl.wa.extract; /* * #%L * warc-indexer * $Id:$ * $HeadURL:$ * %% * Copyright (C) 2013 - 2014 The UK Web Archive * %% * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as * published by the Free Software Foundation, either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this program. If not, see * <http://www.gnu.org/licenses/gpl-2.0.html>. * #L% */ import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * http://en.wikipedia.org/wiki/Postcodes_in_the_United_Kingdom * * @author Andrew Jackson <Andrew.Jackson@bl.uk> * */ public class Postcodes { public static final String BS7666_APPROX = "[A-Z]{1,2}[0-9R][0-9A-Z]? [0-9][ABD-HJLNP-UW-Z]{2}"; public static final Pattern APPROX_PATTERN = Pattern.compile("[^A-Z0-9]("+BS7666_APPROX+")[^A-Z0-9]"); /** * This uses the approximate matcher to extract all text strings that are probably postcodes. * * @param source * @return */ public static String[] extractProbablePostcodes( String source ) { ArrayList<String> results = new ArrayList<String>(); Matcher m = APPROX_PATTERN.matcher(source); while (m.find()) { results.add(m.group()); } return (String[]) results.toArray(); } /** * This stricter match test takes longer, but validates by disallowing certain postcodes. * See http://stackoverflow.com/questions/5820820/regular-expression-in-c-sharp-uk-postcode * * @param postcode * @return */ public static boolean isPostCode (String postcode) { return ( Pattern.matches("(^[A-PR-UWYZa-pr-uwyz][0-9][ ]*[0-9][ABD-HJLNP-UW-Zabd-hjlnp-uw-z]{2}$)", postcode ) || Pattern.matches("(^[A-PR-UWYZa-pr-uwyz][0-9][0-9][ ]*[0-9][ABD-HJLNP-UW-Zabd-hjlnp-uw-z]{2}$)", postcode) || Pattern.matches("(^[A-PR-UWYZa-pr-uwyz][A-HK-Ya-hk-y][0-9][ ]*[0-9][ABD-HJLNP-UW-Zabd-hjlnp-uw-z]{2}$)", postcode) || Pattern.matches( "(^[A-PR-UWYZa-pr-uwyz][A-HK-Ya-hk-y][0-9][0-9][ ]*[0-9][ABD-HJLNP-UW-Zabd-hjlnp-uw-z]{2}$)", postcode) || Pattern.matches("(^[A-PR-UWYZa-pr-uwyz][0-9][A-HJKS-UWa-hjks-uw][ ]*[0-9][ABD-HJLNP-UW-Zabd-hjlnp-uw-z]{2}$)", postcode) || Pattern.matches("(^[A-PR-UWYZa-pr-uwyz][A-HK-Ya-hk-y][0-9][A-Za-z][ ]*[0-9][ABD-HJLNP-UW-Zabd-hjlnp-uw-z]{2}$)", postcode) || Pattern.matches("(^[Gg][Ii][Rr][]*0[Aa][Aa]$)", postcode) ); } }