package me.test.jdk.java.util; import java.io.BufferedInputStream; import java.io.Closeable; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.CharBuffer; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.LinkedHashMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * java.util.Scanner 有一些不方便的地方。参考其实现追加了以下需要的方法: * {@link #skipBuffer() skipBuffer()}、 * {@link #findAtBeginning(Pattern) findAtBeginning(Pattern)}、 * {@link #findWithinHorizon(Pattern) findWithinHorizon(Pattern)}、 * {@link #hasMore() hasMore()}、 */ public class MyScanner implements Closeable { public static final Pattern LINE_PATTERN = Pattern.compile("\r\n|[\n\r\u2028\u2029\u0085]"); // Boolean indicating if a match result is available private boolean sourceClosed = false; private CharBuffer buf = null; private Readable source; // A holder of the last IOException encountered private IOException lastException; // Boolean indicating more input is required private boolean needInput = false; // Internal matcher used for finding delimiters private Matcher matcher; private boolean closed = false; // The index into the buffer currently held by the Scanner private int position = 0; public MyScanner(Readable source) { this(source, 1024); } public MyScanner(Readable source, int bufSize) { this.source = source; buf = CharBuffer.allocate(bufSize); buf.limit(0); matcher = LINE_PATTERN.matcher(buf); matcher.useTransparentBounds(true); matcher.useAnchoringBounds(false); } private void ensureOpen() { if (closed) { throw new IllegalStateException("Scanner closed"); } } // 判断在buffer中是否包含指定的模式。 public String findWithinHorizon(Pattern pattern, int horizon) { ensureOpen(); if (pattern == null) { throw new NullPointerException(); } if (horizon < 0) { throw new IllegalArgumentException("horizon < 0"); } while (true) { String token = findPatternInBuffer(pattern, horizon); if (token != null) { return token; } if (needInput) { readInput(); } else { break; // up to end of input } } return null; } private String findPatternInBuffer(Pattern pattern, int horizon) { matcher.usePattern(pattern); int bufferLimit = buf.limit(); int horizonLimit = -1; int searchLimit = bufferLimit; if (horizon > 0) { horizonLimit = position + horizon; if (horizonLimit < bufferLimit) { searchLimit = horizonLimit; } } matcher.region(position, searchLimit); if (matcher.find()) { if (matcher.hitEnd() && (!sourceClosed)) { // The match may be longer if didn't hit horizon or real end if (searchLimit != horizonLimit) { // Hit an artificial end; try to extend the match needInput = true; return null; } // The match could go away depending on what is next if ((searchLimit == horizonLimit) && matcher.requireEnd()) { // Rare case: we hit the end of input and it happens // that it is at the horizon and the end of input is // required for the match. needInput = true; return null; } } // Did not hit end, or hit real end, or hit horizon position = matcher.end(); return matcher.group(); } if (sourceClosed) { return null; } // If there is no specified horizon, or if we have not searched // to the specified horizon yet, get more input if ((horizon == 0) || (searchLimit != horizonLimit)) { needInput = true; } return null; } private boolean hasMoreInBuffer() { // System.out.println("=== buf.position/limit/capacity = " // + buf.position() + "/" // + buf.limit() + "/" // + buf.capacity()); // matcher.region(position, buf.limit()); // If we are sitting at the end, no more tokens in buffer if (position == buf.limit()) { return false; } return true; } public void close() throws IOException { if (closed) return; if (source instanceof Closeable) { try { ((Closeable) source).close(); } catch (IOException ioe) { lastException = ioe; } } sourceClosed = true; source = null; closed = true; } public boolean isClosed() { return sourceClosed || closed; } // ---------------------------------------------------------------------------------------- private String getCompleteTokenInBuffer(Pattern pattern) { // Attempt to match against the desired pattern matcher.usePattern(pattern); matcher.region(position, buf.limit()); if (matcher.lookingAt()) { if (position > 0 && matcher.hitEnd() && (!sourceClosed)) { needInput = true; return null; } position = matcher.end(); return matcher.group(); } if (position > 0 && !sourceClosed) { needInput = true; return null; } return null; } // // 最后一次匹配结果在buffer中的开始位置。 // public int begin() { // return matcher.start(); // } // // 最后一次匹配结果在buffer中的结束位置。 // public int end() { // return matcher.end(); // } // // 跳到最后一次匹配结果在buffer中的结束位置。 // public void skipLastMatch() { // buf.position(end()); // } /** * 跳过buffer中的剩余的字符串,并跳过的字符串返回。 */ public String skipBuffer() { String remain = buf.toString(); position = buf.limit(); buf.position(buf.limit()); return remain; } // public void position(int newPosition) { // buf.position(newPosition); // } private void readInput() { if (buf.limit() == buf.capacity()) { makeSpace(); } // Prepare to receive data int p = buf.position(); buf.position(buf.limit()); buf.limit(buf.capacity()); int n = 0; try { n = source.read(buf); } catch (IOException ioe) { lastException = ioe; n = -1; } if (n == -1) { sourceClosed = true; needInput = false; } if (n > 0) { needInput = false; } // Restore current position and limit for reading buf.limit(buf.position()); buf.position(p); } // After this method is called there will either be an exception // or else there will be space in the buffer private void makeSpace() { buf.position(position); // Gain space by compacting buffer if (buf.position() > 0) { buf.compact(); buf.flip(); position = 0; return; } System.out.println("================================double space : " + buf.capacity()); // Gain space by growing buffer int newSize = buf.capacity() * 2; CharBuffer newBuf = CharBuffer.allocate(newSize); newBuf.put(buf); newBuf.flip(); buf = newBuf; position = 0; matcher.reset(buf); } // 判断当前位置buffer中是否以指定模式开始。 public String findAtBeginning(Pattern pattern) { ensureOpen(); if (pattern == null) { throw new NullPointerException(); } // Search for the pattern while (true) { String token = getCompleteTokenInBuffer(pattern); if (token != null) { return token; } if (needInput) { readInput(); } else { // throwFor(); return null; } } } // 判断在buffer中是否包含指定的模式。 public String findWithinHorizon(Pattern pattern) { return findWithinHorizon(pattern, buf.capacity()); } public boolean hasMore() { ensureOpen(); while (!sourceClosed) { if (hasMoreInBuffer()) { return true; } readInput(); } return hasMoreInBuffer(); } // TEST 1 : count line public static void main1(String[] args) throws IOException { StringBuilder b = new StringBuilder(); for (int i = 'a'; i <= 'z'; i++) { for (int j = 0; j < 100; j++) { b.append((char) i); } b.append("\n"); } System.out.println(b); int line = 0; // X x = new X(new CharArrayReader(b.toString().toCharArray()), 64); MyScanner x = new MyScanner(new InputStreamReader(fromLargeFile()), 2048); try { while (x.hasMore()) { if (x.findWithinHorizon(LINE_PATTERN) != null) { line++; System.out.println(line); } else { x.skipBuffer(); } } } finally { x.close(); System.out.println("line count : " + line); } } // TEST 2 : count line && search target line public static void main2(String[] args) throws IOException { StringBuilder b = new StringBuilder(); for (int i = 'a'; i <= 'z'; i++) { for (int j = 0; j < 100; j++) { b.append((char) i); } b.append("\n"); } System.out.println(b); int line = 0; // X x = new X(new CharArrayReader(b.toString().toCharArray()), 100); // Pattern prefix = Pattern.compile("o+"); MyScanner x = new MyScanner(new InputStreamReader(fromLargeFile()), 2048); Pattern prefix = Pattern.compile("INSERT INTO `trade` VALUES "); int atLine = 0; try { while (x.hasMore()) { String s = null; if ((s = x.findAtBeginning(prefix)) != null) { System.out.println("s = " + s); atLine = line + 1; } if (x.findWithinHorizon(LINE_PATTERN) != null) { line++; System.out.println(line); } else { x.skipBuffer(); } } } finally { x.close(); System.out.println("atLine/line : " + atLine + "/" + line); } } public static void main(String[] args) throws IOException { Pattern prefix = Pattern.compile("INSERT INTO `trade` VALUES "); Pattern colPattern = Pattern.compile("(NULL|'(\\\\'|[^'])*'|[\\+-]?\\d+(\\.?\\d*)?),?"); MyScanner x = new MyScanner(new InputStreamReader(fromLargeFile()), 2048); int line = 0; int atLine = 0; int count = 0; long latestTime = 0; SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); long end = 0; try { while (x.hasMore()) { String p = null; // 1. 查找行前缀 if ((p = x.findAtBeginning(prefix)) != null) { atLine = line + 1; while ((p = x.findAtBeginning(Pattern.compile("\\("))) != null) { count++; // 1.1 查找记录 Map<String, String> rec = new LinkedHashMap<String, String>(); for (String colName : cols) { String colValue = x.findAtBeginning(colPattern); rec.put(colName, colValue); } x.findAtBeginning(Pattern.compile("\\)[,;]")); System.out.println("REC : " + count + ". " + rec); String dateCreated = rec.get("date_created"); if (dateCreated != null) { if (dateCreated.endsWith(",")) { dateCreated = dateCreated.substring(0, dateCreated.length() - 1); } dateCreated = dateCreated.replace("'", ""); try { long time = sdf.parse(dateCreated).getTime(); if (latestTime < time) { latestTime = time; } } catch (ParseException e) { e.printStackTrace(); } } } } if ((p = x.findWithinHorizon(LINE_PATTERN)) != null) { line++; System.out.println(line); } else { p = x.skipBuffer(); // System.out.println("REMAIN:" + p); if (x.isClosed()) { line++; } } } } finally { x.close(); end = System.currentTimeMillis(); System.out.println("time = " + ((end - start) / 1000) + " second"); System.out.println("atLine/line : " + atLine + "/" + line); System.out.println("count = " + count); System.out.println("latestTime = " + sdf.format(new Date(latestTime))); System.out.println("buf.capacity = " + x.buf.capacity()); } } static InputStream fromLargeFile() throws FileNotFoundException { final String dumpFile = "/home/zll/tmp/mysql_dump_140928.sql"; return new BufferedInputStream(new FileInputStream(dumpFile)); } public final static String[] cols = new String[] { "id", "version", "adjust_price", "area", "buyer_id", "buyer_memo", "cart_id", "channel", "city", "date_created", "expect_send_date", "last_updated", "pay_time", "postage", "province", "receive_date", "receiver", "seller_memo", "send_date", "status", "telphone", "total_price", "urge_count", "weight", "zip_code", "coupon_id", "pindan_id", "gift_id", "invoice_content", "invoice_name", "logistics_id", "logistics_type", "postage_fee", "street", "man_song_detail_id", "man_song_sended", "is_virtual", "adjust_custom_price", "pay_price", "pay_type", "integral", "iphone_code", "grade", "is_free_postage", "parent_id", "source", "full_re_activity", "actual_price", "box_memo", "neibu_channel", "integral_change", "is_delete", "used_balance" }; }