package me.osm.gazetter.out; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.OutputStreamWriter; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.StringTokenizer; import java.util.concurrent.atomic.AtomicInteger; import me.osm.gazetter.join.out_handlers.AddressPerRowJOHBase; import me.osm.gazetter.join.out_handlers.HandlerOptions; import me.osm.gazetter.join.out_handlers.JoinOutHandler; import me.osm.gazetter.striper.FeatureTypes; import me.osm.osmdoc.read.DOCFileReader; import me.osm.osmdoc.read.DOCFolderReader; import me.osm.osmdoc.read.DOCReader; import me.osm.osmdoc.read.OSMDocFacade; import org.apache.commons.lang3.StringUtils; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; import org.supercsv.io.CsvListWriter; import org.supercsv.prefs.CsvPreference; import com.google.code.externalsorting.ExternalSort; public class CSVOutWriter extends AddressPerRowJOHBase { public static final String NAME = "out-csv"; public static Comparator<String> defaultcomparator; private List<List<String>> columns; private FeatureValueExtractor featureEXT = new FeatureValueExctractorImpl(); private FeatureValueExtractor poiEXT; private AddrRowValueExtractor addrRowEXT = new AddrRowValueExctractorImpl(); private Set<String> addrRowKeys = new HashSet<String>(addrRowEXT.getSupportedKeys()); private Set<String> allSupportedKeys = new HashSet<String>(featureEXT.getSupportedKeys()); private OSMDocFacade osmDocFacade; private DOCReader reader; private int uuidColumnIndex = -1; private static final Set<String> OPTIONS = new HashSet<String>( Arrays.asList("out", "columns", "types", "poi-catalog", "header")); private CsvListWriter csvWriter = null; private LinkedHashSet<String> orderedTypes; private String outFile; private static AtomicInteger instances = new AtomicInteger(); private String tmpFile; private List<String> header = null; @Override public JoinOutHandler initialize(HandlerOptions parsedOpts) { if(parsedOpts.has(null)) { initializeWriter(parsedOpts.getString(null, null)); } else { initializeWriter(parsedOpts.getString("out", null)); } this.columns = parseColumns(StringUtils.join(parsedOpts.getList("columns", null), " ")); this.header = parsedOpts.getList("header", null); allSupportedKeys.addAll(addrRowKeys); checkColumnsKeys(); createComparator(); orderedTypes = new LinkedHashSet<>(parsedOpts.getList("types", Arrays.asList("adrpnt", "poipnt"))); initializePOICatalog(parsedOpts); return this; } @Override protected Collection<String> getHandlerArguments( Collection<String> defOptions) { defOptions.addAll(OPTIONS); return defOptions; } @Override protected void initializeWriter(String file) { this.outFile = file; tmpFile = "out-csv" + instances.getAndIncrement() + ".csv.tmp"; super.initializeWriter(tmpFile); csvWriter = new CsvListWriter(writer, CsvPreference.TAB_PREFERENCE); } private void initializePOICatalog(HandlerOptions paresedOpts) { String poiCatalog = paresedOpts.getString("poi-catalog", "jar"); if(poiCatalog.endsWith(".xml") || poiCatalog.equals("jar")) { reader = new DOCFileReader(poiCatalog); } else { reader = new DOCFolderReader(poiCatalog); } osmDocFacade = new OSMDocFacade(reader, null); poiEXT = new PoiValueExctractorImpl(osmDocFacade); } private void createComparator() { int i = 0; for(List<String> bc : this.columns) { for(String key : bc) { if(key.equals("uid")) { uuidColumnIndex = i; } } i++; } if(this.uuidColumnIndex < 0) { defaultcomparator = new Comparator<String>() { @Override public int compare(String r1, String r2) { if(r1 == null && r2 == null) return 0; if(r1 == null || r2 == null) return r1 == null ? -1 : 1; return r1.compareTo(r2); } }; } else { defaultcomparator = new Comparator<String>() { @Override public int compare(String r1, String r2) { if(r1 == null && r2 == null) return 0; if(r1 == null || r2 == null) return r1 == null ? -1 : 1; String uid1 = StringUtils.split(r1, '\t')[uuidColumnIndex]; String uid2 = StringUtils.split(r2, '\t')[uuidColumnIndex]; return uid1.compareTo(uid2); } }; } } private void checkColumnsKeys() { boolean flag = false; for(List<String> c : this.columns) { for(String key : c) { if(featureEXT != null && !featureEXT.supports(key) && poiEXT != null && !poiEXT.supports(key) && addrRowEXT != null && !addrRowEXT.supports(key)) { System.err.println("Column key " + key + " is not supported."); flag = true; } } } if(flag) { System.exit(1); } } private List<List<String>> parseColumns(String columns) { StringTokenizer tokenizer = new StringTokenizer(columns, " ,;[]", true); List<List<String>> result = new ArrayList<>(); boolean inner = false; List<String> innerList = new ArrayList<>(); while (tokenizer.hasMoreTokens()) { String token = tokenizer.nextToken(); if(!" ".equals(token) && !",".equals(token) && !";".equals(token)) { if("[".equals(token)) { inner = true; } else if("]".equals(token)) { inner = false; result.add(innerList); innerList = new ArrayList<>(); } else { if(inner) { innerList.add(token); } else { result.add(Arrays.asList(token)); } } } } if(!innerList.isEmpty()) { result.add(innerList); } return result; } @Override public void handle(JSONObject object, JSONObject address, String stripe) { String ftype = object.getString("ftype"); if(orderedTypes.contains(ftype) && address != null) { if(FeatureTypes.ADMIN_BOUNDARY_FTYPE.equals(ftype) && !StringUtils.contains(stripe, "binx")) { return; } List<Object> row = new ArrayList<>(); Map<String, JSONObject> mapLevels = mapLevels(address); for (List<String> column : columns) { row.add(getColumn(ftype, object, mapLevels, address, column)); } writeCsvRow(row); } } private synchronized void writeCsvRow(List<Object> row) { try { csvWriter.write(row); csvWriter.flush(); } catch (IOException e) { throw new RuntimeException(e); } } private Map<String, JSONObject> mapLevels(JSONObject addrRow) { try { Map<String, JSONObject> result = new HashMap<String, JSONObject>(); JSONArray parts = addrRow.getJSONArray("parts"); for(int i = 0; i < parts.length(); i++) { JSONObject part = parts.getJSONObject(i); result.put(part.getString("lvl"), part); } return result; } catch (JSONException e) { return null; } } private Object getColumn(String ftype, JSONObject jsonObject, Map<String, JSONObject> addrRowLevels, JSONObject addrRow, List<String> column) { for(String key : column) { Object value = null; if(addrRowKeys.contains(key)) { value = addrRowEXT.getValue(key, jsonObject, addrRowLevels, addrRow); } else { if(FeatureTypes.POI_FTYPE.equals(ftype)) { value = poiEXT.getValue(key, jsonObject); } else { value = featureEXT.getValue(key, jsonObject); } } if(value instanceof String) { value = StringUtils.stripToNull((String) value); } if(value != null) { return value; } } return null; } @Override public void allDone() { try { csvWriter.flush(); super.allDone(); List<File> batch = ExternalSort.sortInBatch( new File(tmpFile), defaultcomparator, ExternalSort.DEFAULTMAXTEMPFILES, Charset.forName("utf8"), null, true); ExternalSort.mergeSortedFiles(batch, new File(outFile), defaultcomparator, Charset.forName("utf-8"), true, false, false, getHeaderString()); new File(tmpFile).delete(); } catch (IOException e) { throw new RuntimeException(e); } } private String getHeaderString() { if(this.header == null) { return null; } ByteArrayOutputStream baos = new ByteArrayOutputStream(); try { CsvListWriter csvListWriter = new CsvListWriter(new OutputStreamWriter(baos), CsvPreference.TAB_PREFERENCE); csvListWriter.write(this.header); csvListWriter.flush(); csvListWriter.close(); return baos.toString("utf-8"); } catch (Exception e) { throw new RuntimeException(e); } } }