package com.brianway.webporter.data; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.thread.CountableThreadPool; import java.util.ArrayList; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; /** * Created by brian on 16/12/8. * 数据处理框架的核心组装类 * 用于拼接输入,处理逻辑和输出,进行线程设置 */ public class BaseAssembler<IN, OUT> { private static final Logger logger = LoggerFactory.getLogger(BaseAssembler.class); protected int threadNum = 1; protected RawInput<IN> rawInput; protected DataProcessor<IN, OUT> dataProcessor; protected List<OutPipeline<OUT>> outPipelines = new ArrayList<>(); // protected ExecutorService executorService; protected CountableThreadPool threadPool; protected AtomicLong outItemCount = new AtomicLong(0); protected AtomicInteger stat = new AtomicInteger(STAT_INIT); protected final static int STAT_INIT = 0; protected final static int STAT_RUNNING = 1; protected final static int STAT_STOPPED = 2; private final AtomicLong inItemCount = new AtomicLong(0); /** * 工厂方法 * * @param rawInput 原始输入 * @param dataProcessor 数据处理的类 * @param <IN> 输入队列的类型参数 * @param <OUT> 输出数据的类型参数 * @return 组装类的实例 */ public static <IN, OUT> BaseAssembler<IN, OUT> create( RawInput<IN> rawInput, DataProcessor<IN, OUT> dataProcessor) { return new BaseAssembler<>(rawInput, dataProcessor); } public BaseAssembler(RawInput<IN> rawInput, DataProcessor<IN, OUT> dataProcessor) { this.rawInput = rawInput; this.dataProcessor = dataProcessor; } protected void initComponent() { if (rawInput == null) { throw new RuntimeException("must set input"); } if (threadPool == null || threadPool.isShutdown()) { threadPool = new CountableThreadPool(threadNum); } if (outPipelines.isEmpty()) { outPipelines.add(new ConsoleOutpipeline<>()); } } public void run() { long startTime = System.currentTimeMillis(); checkRunningStat(); initComponent(); while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) { final IN inItem = rawInput.poll(); if (inItem == null) { if (threadPool.getThreadAlive() == 0) { break; } } else { threadPool.execute(() -> { try { processInItem(inItem); } catch (Exception e) { logger.error("error: " + inItem, e); } finally { inItemCount.incrementAndGet(); } }); } } stat.set(STAT_STOPPED); long endTime = System.currentTimeMillis(); logger.info("Process end. spent {} ms", (endTime - startTime)); // release some resources close(); endTime = System.currentTimeMillis(); logger.info("Total time: {} ms", endTime - startTime); logger.info("Total outItemCount: {}", outItemCount); } protected void processInItem(IN inItem) { List<OUT> outItems = dataProcessor.process(inItem); if (outItems == null || outItems.isEmpty()) { return; } outItemCount.addAndGet(outItems.size()); outPipelines.forEach(outPipeline -> outPipeline.process(outItems)); } private void checkRunningStat() { while (true) { int statNow = stat.get(); if (statNow == STAT_RUNNING) { throw new IllegalStateException("Assembler is already running!"); } if (stat.compareAndSet(statNow, STAT_RUNNING)) { break; } } } protected void checkIfRunning() { if (stat.get() == STAT_RUNNING) { throw new IllegalStateException("Assembler is already running!"); } } public void close() { destroyEach(dataProcessor); outPipelines.forEach(this::destroyEach); threadPool.shutdown(); } private void destroyEach(Object object) { if (object instanceof AutoCloseable) { try { ((AutoCloseable) object).close(); } catch (Exception e) { logger.warn("destroyEach: {}", e); } } } public BaseAssembler<IN, OUT> thread(int threadNum) { this.threadNum = threadNum; return this; } public BaseAssembler<IN, OUT> setOutPipelines(List<OutPipeline<OUT>> outPipelines) { checkIfRunning(); this.outPipelines = outPipelines; return this; } public BaseAssembler<IN, OUT> addOutPipeline(OutPipeline<OUT> outPipeline) { checkIfRunning(); this.outPipelines.add(outPipeline); return this; } public static void main(String[] args) { String folder = "/Users/brian/Desktop/zhihu/20161124/www.zhihu.com"; OutPipeline<String> outPipeline = new ConsoleOutpipeline<>(); new Thread(() -> { BaseAssembler.create( new FileRawInput(folder), new DemoDataProcessor()) .addOutPipeline(outPipeline) .thread(10) .run(); }).start(); } }