package me.osm.gazetteer.web.imp; import static me.osm.gazetteer.web.imp.IndexHolder.LOCATION; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URL; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Map.Entry; import java.util.Properties; import java.util.Set; import java.util.zip.GZIPInputStream; import me.osm.gazetteer.web.ESNodeHolder; import me.osm.gazetteer.web.FeatureTypes; import me.osm.gazetteer.web.GazetteerWeb; import me.osm.gazetteer.web.executions.AbortedException; import me.osm.gazetteer.web.executions.BackgroudTaskDescription; import me.osm.gazetteer.web.executions.BackgroundExecutorFacade.BackgroundExecutableTask; import me.osm.gazetteer.web.utils.OSMDocProperties; import me.osm.gazetteer.web.utils.OSMDocSinglton; import me.osm.gazetteer.web.utils.ReplacersCompiler; import me.osm.osmdoc.localization.L10n; import me.osm.osmdoc.model.Feature; import me.osm.osmdoc.model.Tag.Val; import me.osm.osmdoc.read.OSMDocFacade; import me.osm.osmdoc.read.tagvalueparsers.LogTagsStatisticCollector; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.exception.ExceptionUtils; import org.elasticsearch.action.ListenableActionFuture; import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsRequestBuilder; import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsResponse; import org.elasticsearch.action.bulk.BulkRequestBuilder; import org.elasticsearch.action.bulk.BulkResponse; import org.elasticsearch.action.index.IndexRequestBuilder; import org.elasticsearch.client.Client; import org.elasticsearch.common.joda.time.LocalDateTime; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.vividsolutions.jts.geom.Coordinate; import com.vividsolutions.jts.geom.GeometryFactory; import com.vividsolutions.jts.geom.LineString; import com.vividsolutions.jts.operation.linemerge.LineMerger; public class LocationsDumpImporter extends BackgroundExecutableTask { private static final LogTagsStatisticCollector POI_STATISTICS = new LogTagsStatisticCollector(); private static final OSMDocFacade FACADE = OSMDocSinglton.get().getFacade(); protected ObjectsWeightBuilder weighter; Logger log = LoggerFactory.getLogger(LocationsDumpImporter.class); protected static final int BATCH_SIZE = 1000; protected Client client; protected BulkRequestBuilder bulkRequest; private String filePath; protected long counter = 0; protected long skipedPoi = 0; protected long skipedByType = 0; private boolean buildingsGeometry; private ListenableActionFuture<BulkResponse> curentBulkRequest; private List<Replacer> hnReplacers = new ArrayList<>(); private List<Replacer> streetsReplacers = new ArrayList<>(); private static final GeometryFactory factory = new GeometryFactory(); private Transliterator transliterator = null; private Set<String> skip; private String callback; private String region; private HashSet<String> skipPoiTypes; public void setCallback(String callback) { this.callback = callback; }; @Override public String getCallbackURL() { return callback; } private static class EmptyAddressException extends Exception { private static final long serialVersionUID = 8178453133841622471L; } public LocationsDumpImporter(String source, boolean buildingsGeometry) { String trClass = GazetteerWeb.config().getTransliteratorClass(); try { this.transliterator = (Transliterator) Class.forName(trClass).newInstance(); } catch (Exception e) { log.warn("Couldn't initialize transliterator {}", trClass); this.transliterator = new ApacheASCIIFoldTransliterator(); } this.buildingsGeometry = buildingsGeometry; this.filePath = source; weighter = new DefaultWeightBuilder(); ReplacersCompiler.compile(hnReplacers, new File("config/replacers/index/hnIndexReplasers")); ReplacersCompiler.compile(streetsReplacers, new File("config/replacers/index/streetsReplacers")); this.skip = new HashSet<>(GazetteerWeb.config().getImportSkipTypes()); this.skipPoiTypes = new HashSet<String>(); fillSkipPoiTypes(); } private void fillSkipPoiTypes() { OSMDocProperties osmdocProperties = GazetteerWeb.osmdocProperties(); for(String branch : osmdocProperties.getIgnoreBranches()) { Collection<? extends Feature> hierarcyBranch = OSMDocSinglton.get().getReader().getHierarcyBranch( osmdocProperties.getImportDefaultHierarchy(), branch); for(Feature f : hierarcyBranch) { this.skipPoiTypes.add(f.getName()); } } this.skipPoiTypes.addAll(osmdocProperties.getIgnoreTypes()); } public static InputStream getFileIS(String osmFilePath) throws IOException, FileNotFoundException { InputStream fileIS = null; if(osmFilePath.startsWith("http")) { fileIS = new URL(osmFilePath).openStream(); } else { fileIS = new FileInputStream(osmFilePath); } if (osmFilePath.endsWith("gz")) { return new GZIPInputStream(fileIS); } if (osmFilePath.endsWith("bz2")) { return new BZip2CompressorInputStream(fileIS); } return fileIS; } @Override public void executeTask() throws AbortedException { client = ESNodeHolder.getClient(); bulkRequest = client.prepareBulk(); IndicesExistsResponse response = new IndicesExistsRequestBuilder( client.admin().indices()).setIndices("gazetteer").execute() .actionGet(); if (!response.isExists()) { IndexHolder.createIndex(); } InputStream fileIS = null; try { fileIS = getFileIS(filePath); this.region = getRegionName(filePath); BufferedReader reader = new BufferedReader(new InputStreamReader(fileIS, "UTF8")); String line = reader.readLine(); while (line != null) { addRequestToBatch(line); line = reader.readLine(); } if(bulkRequest.numberOfActions() > 0) { executeBulk(); } log.info("Import done. {} rows imported.", counter); } catch (AbortedException aborted) { log.info("Import was interrupted. {} rows imported.", counter); throw aborted; } catch (Exception e) { throw new AbortedException("Import aborted. Root error msg: " + ExceptionUtils.getRootCauseMessage(e), e, false); } finally { IOUtils.closeQuietly(fileIS); } } private String getRegionName(String filepath) { String[] parts = StringUtils.split(filepath, "/\\"); if(parts.length > 0) { String last = parts[parts.length - 1]; String name = StringUtils.remove(last, ".json"); name = StringUtils.remove(name, ".gz"); name = name.toLowerCase(); return name; } return null; } protected void addRequestToBatch(String line) throws AbortedException { if(line != null) { if(bulkRequest == null) { bulkRequest = client.prepareBulk(); } createRequestAndAdd(line); if(counter % BATCH_SIZE == 0) { executeBulk(); if(isAborted()) { throw new AbortedException(null, null, true); } log.info("{} rows imported", NumberFormat.getNumberInstance().format(counter)); bulkRequest = client.prepareBulk(); } } } protected void createRequestAndAdd(String line) { line = processLine(line); if (line != null) { IndexRequestBuilder ind = indexRequest(line); bulkRequest.add(ind.request()); counter++; } } protected IndexRequestBuilder indexRequest(String line) { IndexRequestBuilder ind = new IndexRequestBuilder(client) .setSource(line).setIndex("gazetteer").setType(LOCATION); return ind; } protected void executeBulk() { if(curentBulkRequest != null && !curentBulkRequest.isDone()) { BulkResponse bulkResponse = curentBulkRequest.actionGet(); if (bulkResponse.hasFailures()) { log.error(bulkResponse.buildFailureMessage()); } } if(bulkRequest.numberOfActions() > 0) { curentBulkRequest = bulkRequest.execute(); } } protected String processLine(String line) { try { JSONObject obj = new JSONObject(line); if(doSkip(obj)) { return null; } if(!buildingsGeometry) { obj = filterFullGeometry(obj); } obj = mergeHighwayNetsGeometry(obj); filterAddrPartsNames(obj); try { String searchText = getSearchText(obj); searchText = sanitizeSearchText(searchText); obj.put("search", searchText); } catch (EmptyAddressException e) { return null; } if("poipnt".equals(obj.optString("type"))) { obj.put("poi_class_trans", new JSONArray(getPoiTypesTranslated(obj))); List<Feature> poiClassess = listPoiClassesOSMDoc(obj); Map<String, List<Val>> moreTagsVals = new HashMap<String, List<Val>>(); JSONObject moreTags = FACADE.parseMoreTags(poiClassess, obj.getJSONObject("tags"), POI_STATISTICS, moreTagsVals); obj.put("more_tags", moreTags); LinkedHashSet<String> keywords = new LinkedHashSet<String>(); FACADE.collectKeywords(poiClassess, moreTagsVals, keywords, null); obj.put("poi_keywords", new JSONArray(keywords)); } obj.remove("alt_addresses"); obj.remove("alt_addresses_trans"); obj.remove("hhash"); if(obj.has("housenumber")) { obj.put("housenumber_exact", obj.optString("housenumber").toLowerCase()); obj.put("housenumber_main", getMainHousenumber(obj.optString("housenumber"))); obj.put("housenumber", new JSONArray(fuzzyHousenumberIndex(obj.optString("housenumber")))); } obj.put("weight", weighter.weight(obj)); fillImported(obj); return obj.toString(); } catch (JSONException e) { log.error("Failed to parse: " + line); return null; } } private void fillImported(JSONObject obj) { JSONObject imported = new JSONObject(); imported.put("region", this.region); imported.put("imp_ts", new LocalDateTime().toDateTime().toString()); Object genTS = obj.opt("timestamp"); if(genTS != null) { imported.put("gen_ts", genTS); obj.remove("timestamp"); } obj.put("_imported", imported); } private String getMainHousenumber(String optString) { String lowerCase = optString.toLowerCase().trim(); int l = 0; for(char c : lowerCase.toCharArray()) { if(c < '0' || c > '9') { break; } l++; } return lowerCase.substring(0, l); } private JSONObject mergeHighwayNetsGeometry(JSONObject jsonObject) { if(jsonObject.getString("type").equals(FeatureTypes.HIGHWAY_NET_FEATURE_TYPE)) { JSONArray geometriesArray = jsonObject.optJSONArray("geometries"); if(geometriesArray != null) { List<LineString> lss = new ArrayList<>(); for(int i = 0; i < geometriesArray.length(); i++ ) { JSONObject geom = geometriesArray.getJSONObject(i); lss.add(getLineStringGeometry(geom.getJSONArray("coordinates"))); } LineMerger merger = new LineMerger(); merger.add(lss); @SuppressWarnings("unchecked") Collection<LineString> merged = (Collection<LineString>)merger.getMergedLineStrings(); jsonObject.remove("geometries"); jsonObject.getJSONObject("center_point").remove("type"); jsonObject.put("full_geometry", writeMultiLineString(merged)); } } return jsonObject; } private boolean doSkip(JSONObject obj) { if(this.skip.contains(obj.getString("type"))) { skipedByType++; return true; } if("poipnt".equals(obj.getString("type"))) { Object poiClass = obj.opt("poi_class"); if(poiClass instanceof JSONArray ) { JSONArray clazzA = (JSONArray)poiClass; Set<String> asStrings = new HashSet<>(); for(int i = 0; i < clazzA.length(); i++) { asStrings.add(clazzA.getString(i)); } if(skipPoiTypes.containsAll(asStrings)) { skipedPoi++; return true; } } else if(poiClass instanceof String ) { if(skipPoiTypes.contains(poiClass)) { skipedPoi++; return true; } } } return false; } private void filterAddrPartsNames(JSONObject obj) { JSONObject addr = obj.optJSONObject("address"); if(addr != null) { JSONArray parts = addr.optJSONArray("parts"); if(parts != null) { for(int i=0; i < parts.length();i++) { JSONObject part = parts.getJSONObject(i); if(part != null) { part.remove("names"); } } } } } public List<String> fuzzyHousenumberIndex(String optString) { LinkedHashSet<String> result = new LinkedHashSet<>(); if(StringUtils.isNotBlank(optString)) { result.add(optString); result.addAll(transformHousenumbers(optString)); } return new ArrayList<>(result); } private Collection<String> transformHousenumbers(String optString) { return transform(optString, hnReplacers); } private Collection<String> transformStreets(String optString) { Collection<String> s = transform(optString, streetsReplacers); s.add(optString); return s; } private Collection<String> transform(String optString, Collection<Replacer> replacers) { Set<String> result = new HashSet<>(); for(Replacer replacer : replacers) { try { Collection<String> replace = replacer.replace(optString); if(replace != null) { for(String s : replace) { if(StringUtils.isNotBlank(s) && !"null".equals(s)) { result.add(s); } } } } catch (Exception e) { } } return result; } private String sanitizeSearchText(String shortText) { shortText = shortText.toLowerCase(); shortText = StringUtils.remove(shortText, ","); shortText = StringUtils.replace(shortText, "-", " "); shortText += addTransliteration(shortText); return shortText; } private String addTransliteration(String text) { StringBuilder sb = new StringBuilder(); for(String term : StringUtils.split(text, GazetteerWeb.config().getQueryAnalyzerSeparators())) { String translit = transliterator.transliterate(term); if(!term.equals(translit)) { sb.append(" ").append(translit); } } return sb.toString(); } private String getSearchText(JSONObject obj) throws EmptyAddressException { StringBuilder sb = new StringBuilder(); JSONObject addrobj = obj.optJSONObject("address"); if(addrobj != null) { String addrText = null; JSONArray jsonArray = addrobj.optJSONArray("parts"); if(jsonArray == null) { addrText = addrobj.optString("longText"); } else { LinkedHashMap<String, JSONObject> addrLevels = new LinkedHashMap<>(10); for(int i = 0; i < jsonArray.length(); i++ ) { JSONObject addrPart = jsonArray.getJSONObject(i); addrLevels.put(addrPart.getString("lvl"), addrPart); } addrText = getAddrText(obj, addrobj, addrLevels); } if(!"admbnd".equals(obj.optString("type")) && !"plcpnt".equals(obj.optString("type"))) { String streetName = getStreetName(obj, addrobj); if(streetName != null) { Collection<String> transformStreets = transformStreets(streetName); JSONArray streetAltNames = obj.optJSONArray("street_alternate_names"); if(streetAltNames == null) { streetAltNames = new JSONArray(); } for(String alt : transformStreets) { obj.put("street_name_var", alt); } } } if(StringUtils.isNotBlank(addrText)) { sb.append(addrText); if(!obj.has("locality_name") && obj.has("nearest_place")) { sb.append(" ").append(obj.getJSONObject("nearest_place").getString("name")); } } else { throw new EmptyAddressException(); } } if("poipnt".equals(obj.optString("type"))) { List<String> titles = getPoiTypesTranslated(obj); for(String s : titles) { sb.append(" ").append(s); } String name = obj.optString("name"); sb.append(" ").append(name); JSONObject tags = obj.optJSONObject("tags"); if(tags != null) { concatTagValue(sb, tags, "operator"); concatTagValue(sb, tags, "brand"); concatTagValue(sb, tags, "network"); concatTagValue(sb, tags, "ref"); concatTagValue(sb, tags, "branch"); } } return StringUtils.remove(sb.toString(), ','); } private String getStreetName(JSONObject obj, JSONObject addrobj) { if("hghnet".equals(obj.optString("type")) || "hghway".equals(obj.optString("type")) ) { return obj.optString("name"); } JSONArray jsonArray = addrobj.optJSONArray("parts"); if(jsonArray != null) { for(int i = 0; i < jsonArray.length(); i++ ) { JSONObject addrPart = jsonArray.getJSONObject(i); if("street".equals(addrPart.getString("lvl"))) { return addrPart.getString("name"); } } } return null; } private String getAddrText(JSONObject obj, JSONObject addrobj, LinkedHashMap<String, JSONObject> addrLevels) { StringBuilder sb = new StringBuilder(); for(Entry<String, JSONObject> entry : addrLevels.entrySet()) { JSONObject part = entry.getValue(); String lvlText = getLvlText(part, entry.getValue()); if(StringUtils.isNotBlank(lvlText)) { sb.append(" ").append(lvlText); } } return sb.toString(); } private String getLvlText(JSONObject part, JSONObject value) { String name = part.getString("name"); if("street".equals(part.getString("lvl"))) { return StringUtils.join(transformStreets(name), " ").toLowerCase(); } return name; } private void concatTagValue(StringBuilder sb, JSONObject tags, String tag) { String tv = StringUtils.stripToNull(tags.optString(tag)); if(tv != null) { for(String s : StringUtils.split(tv, ";")) { sb.append(" ").append(s); } } } private List<String> getPoiTypesTranslated(JSONObject obj) { List<String> result = new ArrayList<String>(1); List<Feature> classes = listPoiClassesOSMDoc(obj); for(Feature f : classes) { for(String ln : L10n.supported) { String translatedTitle = FACADE.getTranslatedTitle(f, Locale.forLanguageTag(ln)); result.add(translatedTitle); } } return result; } private List<Feature> listPoiClassesOSMDoc(JSONObject obj) { JSONArray poiClasses = obj.getJSONArray("poi_class"); List<Feature> classes = new ArrayList<Feature>(); for(int i = 0; i < poiClasses.length(); i++) { String classCode = poiClasses.getString(i); Feature poiClass = FACADE.getFeature(classCode); if(poiClass != null) { classes.add(poiClass); } else { log.warn("Couldn't find poi class for code {}", classCode); } } return classes; } private JSONObject filterFullGeometry(JSONObject jsonObject) { if(jsonObject.getString("type").equals(FeatureTypes.ADDR_POINT_FTYPE) || jsonObject.getString("type").equals(FeatureTypes.POI_FTYPE)) { jsonObject.remove("full_geometry"); } return jsonObject; } private JSONObject writeMultiLineString(Collection<LineString> merged) { JSONObject result = new JSONObject(); result.put("type", "multilinestring"); JSONArray coords = new JSONArray(); result.put("coordinates", coords); for(LineString ls : merged) { coords.put(lineStringCoords(ls)); } return result; } private JSONArray lineStringCoords(LineString ls) { JSONArray result = new JSONArray(); Coordinate[] coordinates = ls.getCoordinates(); for(Coordinate c : coordinates) { result.put(new double[]{c.x, c.y}); } return result; } public static LineString getLineStringGeometry(JSONArray coordsJSON) { Coordinate[] coords = new Coordinate[coordsJSON.length()]; for(int i = 0; i < coordsJSON.length(); i++) { JSONArray p = coordsJSON.getJSONArray(i); coords[i] = new Coordinate(p.getDouble(0), p.getDouble(1)); } return factory.createLineString(coords); } public List<Replacer> getReplacers() { return hnReplacers; } @Override public BackgroudTaskDescription description() { BackgroudTaskDescription description = new BackgroudTaskDescription(); description.setId(this.getId()); description.setUuid(this.getUUID()); description.setClassName(getClass().getName()); Map<String, Object> parameters = new HashMap<String, Object>(); description.setClassName(getClass().getName()); description.setParameters(parameters); parameters.put("source", filePath); parameters.put("skip", new HashSet<>(skip)); parameters.put("skipPoiTypes", new HashSet<>(skipPoiTypes)); // Not synchronized, we don't need 100% correct data here parameters.put("skipPoiTypes", skipedPoi); parameters.put("skipedByType", skipedByType); parameters.put("callback", callback); return description; } }