package com.brianway.webporter.collector.zhihu.processor;
import com.brianway.webporter.collector.zhihu.ZhihuConfiguration;
import com.brianway.webporter.data.BaseAssembler;
import com.brianway.webporter.data.ConsoleOutpipeline;
import com.brianway.webporter.data.DataProcessor;
import com.brianway.webporter.data.FileRawInput;
import com.brianway.webporter.data.HashSetDuplicateRemover;
import com.brianway.webporter.data.elasticsearch.Document;
import com.brianway.webporter.util.FileHelper;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.selector.Json;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
* 从原始数据生成满足 Elasticsearch 格式的 json 数据
*/
public class ZhihuMemberDataProcessor implements DataProcessor<File, Document> {
private HashSetDuplicateRemover<String> duplicateRemover = new HashSetDuplicateRemover<>();
@Override
public List<Document> process(File inItem) {
String s = readMember(inItem);
List<Document> documents = null;
if (!StringUtils.isEmpty(s)) {
documents = new ArrayList<>(1);
Json json = new Json(s);
String id = json.jsonPath("$.id").get();
if (!duplicateRemover.isDuplicate(id)) {
documents.add(new Document(id, s));
}
}
return documents;
}
public static String readMember(File inItem) {
List<String> followees = FileHelper.processFile(inItem, br -> {
br.readLine();//pass first line
String s = br.readLine();
return Collections.singletonList(s);
}).orElse(new ArrayList<>());
return followees.size() == 0 ? null : followees.get(0);
}
public static void main(String[] args) {
ZhihuConfiguration configuration = new ZhihuConfiguration();
String folder = configuration.getMemberDataPath();
DataProcessor<File, Document> processor = new ZhihuMemberDataProcessor();
ConsoleOutpipeline<Document> outPipeline = new ConsoleOutpipeline<>();
BaseAssembler.create(new FileRawInput(folder), processor)
.addOutPipeline(i -> {
}) // 需要打印时替换为 outPipeline
.thread(10)
.run();
System.out.println("out sent :" + outPipeline.getCount());
}
}