package org.solrmarc.mixin;

import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.marc4j.marc.DataField;
import org.marc4j.marc.Record;
import org.marc4j.marc.Subfield;
import org.marc4j.marc.VariableField;
import org.solrmarc.index.SolrIndexer;
import org.solrmarc.index.SolrIndexerMixin;
import org.solrmarc.tools.DataUtil;

/**
 * Indexer mixin that builds a "region" facet from the 651$a values of a record plus the
 * runs of consecutive $z subfields found in its 650 fields.
 *
 * <p>Within a run of $z subfields, the first value is treated as the enclosing place and
 * each following value is emitted with the enclosing place added as a parenthetical
 * qualifier, e.g. {@code $zBrazil$zNatal$zDurban} yields {@code Brazil},
 * {@code Natal (Brazil)} and {@code Durban (Natal, Brazil)}.
 */
public class RegionFacetMixin extends SolrIndexerMixin
{
    /**
     * Pairs of { full name, abbreviation used as qualifier } for U.S. states and territories,
     * Canadian provinces and territories, and Australian states and territories.
     */
    final static String stateTable[][] =
    {
        {"Alabama", "Ala."}, {"Alaska", "Alaska"}, {"Arizona", "Ariz."}, {"Arkansas", "Ark."},
        {"California", "Calif."}, {"Colorado", "Colo."}, {"Connecticut", "Conn."}, {"Delaware", "Del."},
        {"Florida", "Fla."}, {"Georgia", "Ga."}, {"Hawaii", "Hawaii"}, {"Idaho", "Idaho"},
        {"Illinois", "Ill."}, {"Indiana", "Ind."}, {"Iowa", "Iowa"}, {"Kansas", "Kan."},
        {"Kentucky", "Ky."}, {"Louisiana", "La."}, {"Maine", "Maine"}, {"Maryland", "Md."},
        {"Massachusetts", "Mass."}, {"Michigan", "Mich."}, {"Minnesota", "Minn."}, {"Mississippi", "Miss."},
        {"Missouri", "Mo."}, {"Montana", "Mont."}, {"Nebraska", "Neb."}, {"Nevada", "Nev."},
        {"New Hampshire", "N.H."}, {"New Jersey", "N.J."}, {"New Mexico", "N.M."}, {"New York", "N.Y."},
        {"North Carolina", "N.C."}, {"North Dakota", "N.D."}, {"Ohio", "Ohio"}, {"Oklahoma", "Okla."},
        {"Oregon", "Or."}, {"Pennsylvania", "Pa."}, {"Rhode Island", "R.I."}, {"South Carolina", "S.C."},
        {"South Dakota", "S.D."}, {"Tennessee", "Tenn."}, {"Texas", "Tex."}, {"Utah", "Utah"},
        {"Vermont", "Vt."}, {"Virginia", "Va."}, {"Washington", "Wash."}, {"West Virginia", "W. Va."},
        {"Wisconsin", "Wis."}, {"Wyoming", "Wyo."}, {"New York (State)", "N.Y."},
        {"District of Columbia", "D.C."}, {"Puerto Rico", "P.R."}, {"Virgin Islands", "V.I."},
        {"Alberta", "Alta."}, {"British Columbia", "B.C."}, {"Manitoba", "Man."},
        {"Newfoundland and Labrador", "N.L."}, {"New Brunswick", "N.B."},
        {"Northwest Territories", "N.W.T."}, {"Nova Scotia", "N.S."}, {"Nunavut", "Nunavut"},
        {"Ontario", "Ont."}, {"Prince Edward Island", "P.E.I."},
        // NOTE(review): the abbreviation was stored as a mis-encoded literal ("Que'bec"). It is
        // restored here as 'e' followed by the combining acute accent (U+0301), presumably the
        // decomposed form that the letter-plus-marks regexes below expect -- confirm against real data.
        {"Quebec", "Que\u0301bec"},
        // Was keyed on "Saskatoon" (a city); "Sask." abbreviates the province of Saskatchewan.
        {"Saskatchewan", "Sask."},
        {"Yukon", "Yukon"},
        {"Australian Capital Territory", "A.C.T."}, {"New South Wales", "N.S.W."},
        {"Northern Territory", "N.T."}, {"Queensland", "Qld."}, {"South Australia", "S. Aust."},
        {"Tasmania", "Tas."}, {"Victoria", "Vic."}, {"Western Australia", "W.A."}
    };

    /**
     * Full name to abbreviation lookup built from {@link #stateTable}. Built once during class
     * initialization (instead of lazily on first use) so that concurrent callers can never
     * observe a partially populated map.
     */
    static final Map<String, String> stateMap = buildStateMap();

    /** Words that, when found alone inside a parenthetical qualifier, name a kind of place rather than a place. */
    final static String locationTypeNames[] =
    {
        "State", "Republic", "Principality", "Province", "Township", "County", "Town",
        "Judicial district", "Prefecture", "Region", "District", "Dept.", "Kingdom", "Canton",
        "City", "Division", "Duchy", "Emirate", "Government", "Country",
        /* India: */ "Princely State", "Presidency", "Tahsil", "Taluka", "Tehsil", "Thana",
        /* China: */ "Sheng",
        /* Denmark: */ "Amt", "Herred",
        /* Thailand: */ "Amphoe",
        // NOTE(review): was a mis-encoded literal ("Comte" followed by a stray quote character);
        // restored as 'e' + combining acute accent (U+0301) -- confirm the form used in real data.
        /* France: */ "Comte\u0301",
        /* South/Central America: */ "Corregimiento", "Distrito Federal", "Intendancy", "Partido",
        /* Religious: */ "Diocese", "diocese", "Archdiocese", "Archdeaconry", "Ecclesiastical principality",
        /* Poland: */ "Voivodeship", "Powiat",
        /* Germany: */ "Landkreis", "Kreis", "Bezirk", "Electorate", "Grafschaft",
        /* Czech: */ "Okres",
        /* Russia: */ "Oblast'", "Oblast", "Kray",
        /* Hungary: */ "Comitat",
        /* Romania: */ "Judet",
        /* Indonesia: */ "Kabupaten",
        /* Former: */ "Ancient city", "Ancient sanctuary", "Ancient site", "Extinct city",
        "Concentration camp", "Colony", "Site",
        /* Descriptive: */ "Peninsula", "Coast", "Cape", "Harbor", "Island", "Lake", "Oasis", "Tribal area"
    };

    /** Set view of {@link #locationTypeNames}, built once during class initialization. */
    static final Set<String> locationTypeNameSet = buildLocationTypeNameSet();

    // The regular expressions below are the ones the original code passed to String.matches /
    // replaceAll / replaceFirst / split, compiled once instead of on every call. Their text (and
    // therefore their capture-group numbering) is unchanged. "\p{L}\p{M}*" matches one letter
    // followed by any combining marks, so accented names are accepted.

    /** Captures "Name" ($1) and the start of the qualifier ($4) of "Name (Qualifier...". */
    private static final Pattern NAME_WITH_QUALIFIER = Pattern.compile(
            "((\\p{L}\\p{M}*|\\.|[- ])+(\\p{L}\\p{M}*|\\.))[ ]?\\(((\\p{L}\\p{M}*|\\.|[- ])+).*");

    /** Matches a value that ends in a parenthetical made only of letters, dots, hyphens and spaces. */
    private static final Pattern ENDS_WITH_SIMPLE_QUALIFIER = Pattern.compile(
            ".*[ ]?\\((\\p{L}\\p{M}*|\\.|[- ])+\\)");

    /** Matches "Name (Place : Type)" in full. */
    private static final Pattern TYPED_QUALIFIER = Pattern.compile(
            "(\\p{L}\\p{M}*|\\.|[- ])+[ ]?\\((\\p{L}\\p{M}*|\\.|[- ])+ : (\\p{L}\\p{M}*|\\.|[- ])+\\)");

    /** Captures name ($1), place ($4) and type ($6) of "Name (Place : Type)". */
    private static final Pattern TYPED_QUALIFIER_GROUPS = Pattern.compile(
            "((\\p{L}\\p{M}*|\\.|[- ])+(\\p{L}\\p{M}*|\\.))[ ]?\\(((\\p{L}\\p{M}*|\\.|[- ])+) : ((\\p{L}\\p{M}*|\\.|[- ])+)\\)");

    /** Matches the opening parenthesis of a qualifier, together with one optional preceding space. */
    private static final Pattern QUALIFIER_OPEN = Pattern.compile("[ ]?\\(");

    /** Builds the read-only name-to-abbreviation map from {@link #stateTable}. */
    private static Map<String, String> buildStateMap()
    {
        Map<String, String> map = new LinkedHashMap<String, String>();
        for (String[] entry : stateTable)
        {
            map.put(entry[0], entry[1]);
        }
        return Collections.unmodifiableMap(map);
    }

    /** Builds the read-only set of location type words from {@link #locationTypeNames}. */
    private static Set<String> buildLocationTypeNameSet()
    {
        Set<String> names = new LinkedHashSet<String>();
        for (String locType : locationTypeNames)
        {
            names.add(locType);
        }
        return Collections.unmodifiableSet(names);
    }

    /**
     * Looks up the abbreviation for a state, province or territory name.
     *
     * @param stateName full name exactly as listed in {@link #stateTable}
     * @return the abbreviation, or {@code null} when the name is not in the table
     */
    private String getStateNameAbbrev(String stateName)
    {
        return stateMap.get(stateName);
    }

    /**
     * @param name text found inside a parenthetical qualifier
     * @return {@code true} when {@code name} is one of {@link #locationTypeNames}
     */
    private static boolean isLocationTypeWord(String name)
    {
        return locationTypeNameSet.contains(name);
    }

    /**
     * Tells whether an existing qualifier already names the given place, so that the place
     * does not need to be added to it again.
     *
     * @param string1 the existing qualifier text
     * @param string2 the place name to look for
     * @return {@code true} when {@code string1} equals {@code string2}, lists it as part of an
     *         " and " or ", " enumeration, or equals its abbreviation from {@link #stateTable}
     */
    private boolean isEqualsOrContains(String string1, String string2)
    {
        if (string1.equals(string2)
                || string1.contains(" and " + string2)
                || string1.contains(string2 + " and ")
                || string1.contains(", " + string2))
        {
            return true;
        }
        String abbrev = getStateNameAbbrev(string2);
        return abbrev != null && abbrev.equals(string1);
    }

    /**
     * @param data cleaned $z value
     * @return {@code true} for the broad regions that are dropped when they appear after the
     *         first $z of a run
     */
    private static boolean isBroadRegion(String data)
    {
        return data.equals("South America") || data.equals("Central America") || data.equals("United States");
    }

    /**
     * Collects every run of consecutive $z subfields from the 650 fields of a record.
     *
     * <p>Each run becomes one string in which the cleaned subfield values are joined with
     * {@code '#'}. Any other subfield ends the current run, so one field can contribute
     * several strings. A value of "South America", "Central America" or "United States" is
     * skipped unless it is the first $z of its run.
     *
     * @param record the record to read
     * @return the '#'-joined runs, in encounter order, without duplicates
     */
    private Set<String> getSet650z(Record record)
    {
        Set<String> result = new LinkedHashSet<String>();
        @SuppressWarnings("unchecked") // cast kept for marc4j versions that return raw lists
        List<VariableField> fields = (List<VariableField>) record.getVariableFields("650");
        for (VariableField field : fields)
        {
            DataField dataField = (DataField) field;
            @SuppressWarnings("unchecked") // cast kept for marc4j versions that return raw lists
            List<Subfield> subfields = (List<Subfield>) dataField.getSubfields();
            StringBuilder run = new StringBuilder();
            boolean inRun = false;
            for (Subfield subfield : subfields)
            {
                if (subfield.getCode() == 'z')
                {
                    String data = DataUtil.cleanData(subfield.getData());
                    if (!inRun)
                    {
                        run.append(data);
                        inRun = true;
                    }
                    else if (!isBroadRegion(data))
                    {
                        run.append('#').append(data);
                    }
                }
                else if (inRun)
                {
                    // A non-$z subfield closes the run; a later $z starts a fresh one.
                    result.add(run.toString());
                    run.setLength(0);
                    inRun = false;
                }
            }
            if (inRun)
            {
                result.add(run.toString());
            }
        }
        return result;
    }

    /**
     * Adds the enclosing place to a narrower place name as a parenthetical qualifier.
     *
     * <ul>
     * <li>If {@code parent} has an abbreviation in {@link #stateTable}, that abbreviation is
     *     appended: {@code "Place (Abbrev.)"}.</li>
     * <li>If {@code place} has no trailing parenthetical, {@code " (parent)"} is appended.</li>
     * <li>{@code "Place (Type)"} where Type is a location type word becomes
     *     {@code "Place (parent : Type)"}.</li>
     * <li>{@code "Place (Other)"} becomes {@code "Place (Other, parent)"} unless the qualifier
     *     already names the parent, in which case it is left alone.</li>
     * <li>{@code "Place (Other : Type)"} becomes {@code "Place (Other, parent : Type)"}.</li>
     * <li>Anything else ending in ")" is returned unchanged.</li>
     * </ul>
     *
     * @param place  cleaned narrower place name
     * @param parent enclosing place name
     * @return the qualified place name
     */
    private String qualifyWithParent(String place, String parent)
    {
        String abbrev = getStateNameAbbrev(parent);
        if (abbrev != null)
        {
            return place + " (" + abbrev + ")";
        }
        if (!place.endsWith(")"))
        {
            return place + " (" + parent + ")";
        }

        String withoutCloseParen = place.substring(0, place.length() - 1);
        if (!place.contains("("))
        {
            // Stray closing parenthesis only: drop it and add a proper qualifier.
            return withoutCloseParen + " (" + parent + ")";
        }
        if (ENDS_WITH_SIMPLE_QUALIFIER.matcher(place).matches())
        {
            String[] subparts = QUALIFIER_OPEN.split(place, 2);
            if (subparts.length < 2)
            {
                return withoutCloseParen + ", " + parent + ")";
            }
            String qualifier = subparts[1].substring(0, subparts[1].length() - 1);
            if (!qualifier.equals(parent) && isLocationTypeWord(qualifier))
            {
                return subparts[0] + " (" + parent + " : " + qualifier + ")";
            }
            if (!isEqualsOrContains(qualifier, parent))
            {
                return withoutCloseParen + ", " + parent + ")";
            }
            return place;
        }
        if (TYPED_QUALIFIER.matcher(place).matches())
        {
            // e.g. $z Italy $z Satricum (Lazio : Extinct city).
            // The parent comes from record data, so it is quoted to keep any '$' or '\' in it
            // from being interpreted as a group reference or escape in the replacement string.
            return TYPED_QUALIFIER_GROUPS.matcher(place)
                    .replaceFirst("$1 ($4, " + Matcher.quoteReplacement(parent) + " : $6)");
        }
        return place;
    }

    /**
     * Builds the region facet values for a record.
     *
     * <p>The result starts from the 651$a values as returned by
     * {@code SolrIndexer.instance().removeTrailingPunct(record, "651a")}, and is extended with
     * the places found in the $z runs of the 650 fields: the first place of each run as is,
     * and each following place qualified by its enclosing place (see class description).
     * From the third place of a run onwards, the enclosing place is derived from the already
     * qualified second place, so {@code $zBrazil$zNatal$zDurban} gives
     * {@code Durban (Natal, Brazil)}.
     *
     * @param record the record to index
     * @return the set that {@code removeTrailingPunct} returned, with the 650$z places added
     */
    public Set<String> getRegionFacet(final Record record)
    {
        Set<String> result = SolrIndexer.instance().removeTrailingPunct(record, "651a");
        for (String run : getSet650z(record))
        {
            String[] parts = run.split("#");
            if (parts.length == 0)
            {
                // split() drops trailing empty strings, so a run made only of empty values
                // (for example "#") yields no parts at all.
                continue;
            }
            String parent = DataUtil.cleanData(parts[0]);
            result.add(parent);
            for (int i = 1; i < parts.length; i++)
            {
                if (i == 2)
                {
                    // e.g. 650 _0 $aLabor movement $zBrazil $zNatal $zDurban:
                    // parts[1] is now "Natal (Brazil)", which becomes the parent "Natal, Brazil".
                    parent = NAME_WITH_QUALIFIER.matcher(parts[1]).replaceAll("$1, $4");
                }
                // Stored back because parts[1] is read again when i == 2.
                parts[i] = qualifyWithParent(DataUtil.cleanData(parts[i]), parent);
                result.add(parts[i]);
            }
        }
        return result;
    }
}