package com.krickert.ipsearch;

import static com.google.common.base.Preconditions.checkNotNull;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Arrays;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.supercsv.cellprocessor.Optional;
import org.supercsv.cellprocessor.ParseDouble;
import org.supercsv.cellprocessor.constraint.LMinMax;
import org.supercsv.cellprocessor.constraint.StrMinMax;
import org.supercsv.cellprocessor.ift.CellProcessor;
import org.supercsv.io.CsvBeanReader;
import org.supercsv.io.ICsvBeanReader;
import org.supercsv.prefs.CsvPreference;

import com.krickert.ipsearch.city.IpSearchCityBean;

/**
 * Reads IP-to-city records out of a zipped CSV file and pushes them onto a
 * blocking queue so that other threads can consume them (for example, to feed
 * a single index writer).
 * <p>
 * The zip archive is scanned for one named entry; that entry is parsed with
 * the SuperCSV bean reader into {@link IpSearchCityBean} instances.
 * <p>
 * The data is expected to be in the table_full format from the CSV offered on
 * the website. As of the time of the original writing this file was over 400
 * megabytes long and had over 4047599 entries in it, i.e. an average of about
 * 103.6 bytes per entry.
 * <p>
 * The data file is in this format:
 *
 * <pre>
 * "ip_start";"country_code";"country_name";"region_code";"region_name";"city";"zipcode";"latitude";"longitude";"metrocode"
 * </pre>
 *
 * Sample data:
 *
 * <pre>
 * "3523140760";"US";"United States";"17";"Illinois";"Chicago";"60611";"41.9288";"-87.6315";"602"
 * "3523140848";"US";"United States";"17";"Illinois";"Chicago";"60657";"41.9373";"-87.6551";"602"
 * </pre>
 *
 * The file only carries the start of each IP range. The end of a range is
 * derived while reading: it is the start of the following row minus one, and
 * the very last row is closed off at the highest IPv4 address.
 * <p>
 * The minimums and maximums accepted for each column can be found by matching
 * the sample data above to the cell processor array below.
 *
 * @author krickert
 */
public class IpDataReaderTask {

  private static final Log log = LogFactory.getLog(IpDataReaderTask.class);

  /** Highest IPv4 address as an unsigned 32-bit value (256^4 - 1); inclusive end of the last range. */
  private static final long MAX_IPV4_ADDRESS = 0xFFFFFFFFL;

  /** A progress line is logged every time this many records have been parsed. */
  private static final int PROGRESS_LOG_INTERVAL = 50000;

  /*
   * One processor per CSV column, in column order. The bounds were figured out
   * by analyzing the data within the file. A null processor means the column is
   * passed through without validation or conversion.
   */
  private static final CellProcessor[] processors = {
      new LMinMax(0L, 4278190080L),          // ip_start
      new StrMinMax(2L, 2L),                 // country_code
      new Optional(new StrMinMax(4L, 50L)),  // country_name
      new StrMinMax(0L, 2L),                 // region_code
      new StrMinMax(0L, 50L),                // region_name
      new StrMinMax(0L, 34L),                // city
      new StrMinMax(0L, 6L),                 // zipcode
      new ParseDouble(),                     // latitude
      new ParseDouble(),                     // longitude
      null                                   // metrocode
  };

  /*
   * Bean property names, in column order, used to map each CSV column (and the
   * processor at the same index above) onto IpSearchCityBean.
   */
  private static final String[] columnMapping = {
      "ipStart", "countryCode", "countryName", "regionCode", "regionName",
      "city", "zipCode", "lat", "lon", "metroCode"
  };

  /** Path of the zip archive that holds the IP data. */
  public final String zipFileName;

  /** Name (including path) of the entry inside the zip archive to read. */
  public final String fileInZip;

  /** Queue that receives every parsed record. */
  public final BlockingQueue<IpSearchCityBean> queue;

  /**
   * Creates a reader task for a single file and a single queue. It is meant to
   * be run by one producer thread while multiple other threads read from the
   * queue.
   *
   * @param zipFileName
   *          the name of the zipped file downloaded from the internet that has
   *          the ip spatial data
   * @param fileInZip
   *          the path and name of the file in the ZIP file
   * @param queue
   *          the blocking queue that all the parsed data from the file is sent
   *          off to
   * @throws NullPointerException
   *           if any argument is null
   */
  public IpDataReaderTask(String zipFileName, String fileInZip, BlockingQueue<IpSearchCityBean> queue) {
    super();
    this.zipFileName = checkNotNull(zipFileName);
    this.fileInZip = checkNotNull(fileInZip);
    this.queue = checkNotNull(queue);
  }

  /**
   * Starts reading the file on a newly created single-thread executor and
   * returns immediately.
   * <p>
   * The executor is NOT shut down here; the caller owns it and should call
   * {@link ExecutorService#shutdown()} when done, otherwise its worker thread
   * stays alive after the read has finished.
   *
   * @return the executor the read was submitted to
   */
  public ExecutorService fireAndForget() {
    ExecutorService executor = Executors.newSingleThreadExecutor();
    executor.execute(new IpDataReaderThread());
    return executor;
  }

  /**
   * Reads the whole file synchronously on the calling thread. Records are
   * enqueued with a blocking put, so with a bounded queue this call waits
   * whenever the queue is full until a consumer makes room.
   *
   * @return the queue that was passed to the constructor
   * @throws IllegalStateException
   *           if the zip file does not exist or an I/O error occurs while reading
   * @throws IllegalArgumentException
   *           if the zip cannot be scanned or does not contain the requested entry
   */
  public BlockingQueue<IpSearchCityBean> fire() {
    new IpDataReaderThread().run();
    return queue;
  }

  /** The unit of work: opens the zip, parses the CSV entry and fills the queue. */
  private class IpDataReaderThread implements Runnable {

    @Override
    public void run() {
      this.queueIpEntries();
    }

    /**
     * Opens the zip file, positions on the configured entry, parses it as CSV
     * and puts every record on the queue.
     * <p>
     * Each record is held back until the next one has been read, because its
     * range end is the next record's start minus one. The final record is
     * closed off at {@link #MAX_IPV4_ADDRESS}.
     *
     * @throws IllegalStateException
     *           if the zip file is missing or an I/O error occurs while reading
     * @throws IllegalArgumentException
     *           if the requested entry cannot be located in the archive
     */
    public void queueIpEntries() {
      InputStream fileStream = openZipFile();
      ZipInputStream zip = findFileInZip(fileStream);
      Reader fr = null;
      ICsvBeanReader inFile = null;
      try {
        // NOTE(review): no charset is given, so the platform default is used —
        // confirm the encoding of the data file and pass it explicitly.
        fr = new InputStreamReader(zip);
        inFile = new CsvBeanReader(fr, CsvPreference.EXCEL_NORTH_EUROPE_PREFERENCE);
        final String[] header = inFile.getCSVHeader(true);
        log.info("The following header was parsed: " + Arrays.toString(header));

        IpSearchCityBean previousRow = null;
        IpSearchCityBean currentRow;
        int counter = 0;
        while ((currentRow = inFile.read(IpSearchCityBean.class, columnMapping, processors)) != null) {
          if (++counter % PROGRESS_LOG_INTERVAL == 0) {
            log.info(counter + " number of records parsed.");
          }
          if (previousRow != null) {
            // The previous range ends right before the current one starts.
            previousRow.setIpEnd(currentRow.getIpStart().longValue() - 1L);
            queue.put(previousRow);
          }
          previousRow = currentRow;
        }

        if (previousRow == null) {
          // Header only: there is nothing to enqueue.
          log.warn("No data rows found in " + fileInZip + " inside " + zipFileName);
        } else {
          // The last range has no successor, so it runs to the top of the IPv4 space.
          previousRow.setIpEnd(MAX_IPV4_ADDRESS);
          // put (not add): block on a full bounded queue like every other row does.
          queue.put(previousRow);
        }
      } catch (IOException e) {
        throw new IllegalStateException("The zip file opened but an IO exception was thrown while reading the zip file.", e);
      } catch (InterruptedException e) {
        // from the queue offering
        log.error("queue offering interrupted.", e);
        // Restore the flag so the executor / caller can observe the interruption.
        Thread.currentThread().interrupt();
      } finally {
        if (inFile != null) {
          try {
            inFile.close();
          } catch (IOException e) {
            log.fatal(e);
          }
        }
        if (fr != null) {
          try {
            // Closing the reader also closes the wrapped zip stream.
            fr.close();
          } catch (IOException e) {
            log.warn("failed to close file reader from zip file.", e);
          }
        }
      }
      log.info("\n*******************\n** IpData all in queue. Terminating process\n**\n******************");
    }

    /**
     * Wraps the stream in a {@link ZipInputStream} and advances it to the entry
     * whose name equals {@code fileInZip}.
     *
     * @param fileStream
     *          the raw stream of the zip archive
     * @return the zip stream positioned at the start of the requested entry
     * @throws IllegalArgumentException
     *           if the entry is not present or the archive cannot be read; the
     *           stream is closed before throwing
     */
    private ZipInputStream findFileInZip(InputStream fileStream) {
      ZipInputStream zip = new ZipInputStream(fileStream);
      try {
        ZipEntry entry;
        while ((entry = zip.getNextEntry()) != null) {
          if (entry.getName().equals(fileInZip)) {
            return zip;
          }
        }
      } catch (IOException e) {
        log.fatal("zip file appears to be empty.", e);
        closeQuietly(zip);
        throw new IllegalArgumentException("problem opening up the zip file", e);
      }
      log.error("The zip file is valid but does not match the ${ipsearch.fileinzip} entry from the project.properties file");
      closeQuietly(zip);
      throw new IllegalArgumentException("Couldn't find file " + fileInZip + " in zip archive " + zipFileName);
    }

    /**
     * Opens the zip archive named by {@code zipFileName}.
     *
     * @return an open stream on the archive
     * @throws IllegalStateException
     *           if the file does not exist
     */
    private InputStream openZipFile() {
      try {
        return new FileInputStream(zipFileName);
      } catch (FileNotFoundException e) {
        log.fatal("The file we were supposed to download does not exist: [" + zipFileName + "]", e);
        throw new IllegalStateException("File check occurred after downloading/starting application and is no longer there.", e);
      }
    }

    /** Closes the stream on an error path; a failure to close is logged, not rethrown. */
    private void closeQuietly(InputStream in) {
      try {
        in.close();
      } catch (IOException e) {
        log.warn("failed to close zip stream.", e);
      }
    }
  }
}