/**
 *
 * APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川,
 * yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.apdplat.superword.tools;

import org.apache.commons.lang.StringUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apdplat.superword.model.Word;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

/**
 * Converts PDF documents into text files containing one extracted sentence per line.
 *
 * <p>The text of each PDF is extracted with Tika, re-assembled into paragraphs,
 * split into sentences, and every sentence is run through a series of heuristic
 * filters (see {@link #processSentence(String)}) before it is written out.
 *
 * @author 杨尚川
 */
public class PdfParser {
    private PdfParser(){}

    private static final Logger LOGGER = LoggerFactory.getLogger(PdfParser.class);
    private static final AutoDetectParser PARSER = new AutoDetectParser();
    /** Sentences with fewer whitespace-separated words than this are dropped. */
    private static final int SENTENCE_WORD_MIN_COUNT = 10;
    /** Sentences containing a (non-URL) word longer than this are dropped. */
    private static final int MAX_WORD_CHAR_COUNT = 18;
    /** Maximum allowed ratio of capitalized words in a sentence. */
    private static final float SENTENCE_CAP_WORD_MAX_RATE = 0.4f;
    /**
     * Placeholder substituted for dots that are not sentence terminators
     * (see {@link #prepareSeg(String)}); {@link #segSentence(String)} turns it
     * back into "." after splitting.
     */
    private static final String DOT_PLACEHOLDER = "杨尚川";
    /** Strings stripped from a word before testing whether it is purely alphabetic. */
    private static final Set<String> PUNCTUATION = new HashSet<>(Arrays.asList(
            ",", "’", "‐", "‑", "‒", "–", "—", "-", "―",
            ":", ";", "/", "+", "=", "==", "%", "!", "'", "\"",
            "[", "]", "(", ")", "“", "”", "?"));
    /** Characters that caused a paragraph to be rejected by {@link #paragraphValid(String)}. */
    private static final Set<Character> CORRUPT_CHAR = new HashSet<>();
    private static final Set<Word> DICTIONARY = WordSources.getAll();
    /** Sentence length (in words) -> number of sentences; only filled when debug logging is on. */
    private static final Map<Integer, AtomicInteger> SENTENCE_LENGTH_INFO = new ConcurrentHashMap<>();

    /**
     * Parses a PDF file into plain text.
     *
     * @param file relative or absolute path of a local PDF file
     * @return the extracted text, or an empty string if parsing failed
     */
    public static String parsePdfFileToPlainText(String file) {
        try (InputStream stream = new FileInputStream(file)) {
            BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
            Metadata metadata = new Metadata();
            PARSER.parse(stream, handler, metadata);
            return handler.toString();
        } catch (Exception e) {
            LOGGER.error("解析PDF文件出错:" + file, e);
        }
        return "";
    }

    /**
     * Parses every PDF file below the given directory.
     *
     * @param dir path of the directory to walk
     * @see #parseDirectory(Path)
     */
    public static void parseDirectory(String dir){
        parseDirectory(Paths.get(dir));
    }

    /**
     * Walks the directory tree, parses every file with {@link #parseFile(Path)}
     * and writes the non-blank results, one per line, to
     * {@code src/main/resources/it/manifest}.
     *
     * @param dir the directory to walk
     */
    public static void parseDirectory(Path dir){
        try {
            long start = System.currentTimeMillis();
            LOGGER.info("开始处理目录:" + dir);
            List<String> fileNames = new ArrayList<>();
            Files.walkFileTree(dir, new SimpleFileVisitor<Path>() {
                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                    String fileName = parseFile(file);
                    // parseFile returns null or "" for skipped/failed files
                    if (StringUtils.isNotBlank(fileName)) {
                        fileNames.add(fileName);
                    }
                    return FileVisitResult.CONTINUE;
                }
            });
            Files.write(Paths.get("src/main/resources/it/manifest"), fileNames);
            long cost = System.currentTimeMillis() - start;
            LOGGER.info("处理完毕,耗时:"+cost+"毫秒");
        } catch (IOException e) {
            LOGGER.error("处理目录出错:" + dir, e);
        }
    }

    /**
     * Opens the given ZIP file as a file system and parses every entry in it.
     * Each entry is first copied to a fixed temporary file on the local file
     * system and then handed to {@link #parseFile(String)}.
     *
     * @param zipFile path of the ZIP file
     */
    public static void parseZip(String zipFile){
        long start = System.currentTimeMillis();
        LOGGER.info("开始解析ZIP文件:"+zipFile);
        try (FileSystem fs = FileSystems.newFileSystem(Paths.get(zipFile), WordClassifier.class.getClassLoader())) {
            for (Path path : fs.getRootDirectories()) {
                LOGGER.info("处理目录:"+path);
                Files.walkFileTree(path, new SimpleFileVisitor<Path>(){
                    @Override
                    public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                        LOGGER.info("处理文件:"+file);
                        // Copy the entry to the local file system.
                        // NOTE(review): every entry is copied to the same temp path, so
                        // prepareTarget appears to derive the same output file for every
                        // entry (each one replacing the previous) — confirm this is intended.
                        Path temp = Paths.get("target/it-software-domain-temp.pdf");
                        Files.copy(file, temp, StandardCopyOption.REPLACE_EXISTING);
                        parseFile(temp.toFile().getAbsolutePath());
                        return FileVisitResult.CONTINUE;
                    }
                });
            }
        } catch (Exception e) {
            LOGGER.error("解析文本出错", e);
        }
        long cost = System.currentTimeMillis() - start;
        LOGGER.info("解析完毕,耗时:"+cost+"毫秒");
    }

    /**
     * Parses a single PDF file.
     *
     * @param file path of the PDF file
     * @return see {@link #parseFile(Path)}
     */
    public static String parseFile(String file) {
        return parseFile(Paths.get(file));
    }

    /**
     * Parses a single PDF file and writes the extracted sentences, one per
     * line, to the target file determined by {@link #prepareTarget(Path)}.
     *
     * @param file the PDF file
     * @return the target file name with the {@code src/main/resources} prefix
     *         removed; {@code null} if the file is skipped (hidden or not a
     *         PDF); an empty string if processing failed
     */
    public static String parseFile(Path file) {
        try {
            if (invalid(file)) {
                return null;
            }
            String sourceName = file.toFile().getAbsolutePath();
            String targetName = prepareTarget(file);
            if (targetName == null) {
                // prepareTarget already logged the cause; do not waste time parsing the PDF
                return "";
            }
            LOGGER.info("处理文件:" + sourceName);
            LOGGER.info("生成文件:" + targetName);
            // extract the text
            String text = parsePdfFileToPlainText(sourceName);
            // turn the text into sentences
            List<String> sentences = toSentence(text);
            // save the sentences
            Files.write(Paths.get(targetName), sentences);
            return targetName.replace("src/main/resources", "");
        } catch (Exception e) {
            LOGGER.error("处理文件出错:" + file, e);
        }
        return "";
    }

    /**
     * Tells whether the given line marks the end of a paragraph.
     *
     * @param line a line of extracted text
     * @return {@code true} if the line is blank
     */
    public static boolean paragraphFinish(String line){
        // a blank line means the paragraph is finished
        return StringUtils.isBlank(line);
    }

    /**
     * Splits the extracted text into sentences.
     *
     * @param text text extracted from a PDF
     * @return the sentences that passed all filters
     */
    private static List<String> toSentence(String text){
        List<String> data = new ArrayList<>();
        StringBuilder paragraph = new StringBuilder();
        // split the extracted text into lines
        String[] lines = text.split("[\n\r]");
        for (int i = 0; i < lines.length; i++) {
            String line = lines[i].trim();
            // end of paragraph
            if (paragraphFinish(line)) {
                process(paragraph.toString().trim(), data);
                // reset
                paragraph.setLength(0);
            }
            LOGGER.debug("PDF"+(i+1)+"行:" + line);
            // remove the hyphen that joins a word broken across lines
            while (endsWithHyphen(line)) {
                LOGGER.debug("发现行被折断");
                if ((i + 1) < lines.length) {
                    // drop the trailing hyphen
                    String pre = line.substring(0, line.length() - 1);
                    // fetch the next line
                    String n = lines[i+1].trim();
                    if (StringUtils.isNotBlank(n)) {
                        LOGGER.debug("连接下一行");
                        line = pre+n;
                    }
                    LOGGER.debug("PDF"+(i+2)+"行:" + lines[i + 1]);
                    // the next line is consumed even when it was blank
                    i++;
                } else {
                    LOGGER.debug("连接完毕");
                    break;
                }
            }
            // assemble the paragraph
            String lastLine = null;
            String nextLine = null;
            if (i-1 > -1) {
                lastLine = lines[i-1].trim();
            }
            if (i+1 < lines.length) {
                nextLine = lines[i+1].trim();
            }
            addLineToParagraph(line, lastLine, nextLine, paragraph);
        }
        // end of content
        process(paragraph.toString(), data);
        return data;
    }

    /** Tells whether the line ends with one of the hyphen/dash characters treated as a line-break joiner. */
    private static boolean endsWithHyphen(String line){
        return line.endsWith("-")
                || line.endsWith("‐")
                || line.endsWith("‑")
                || line.endsWith("‒")
                || line.endsWith("–")
                || line.endsWith("—")
                || line.endsWith("―");
    }

    /**
     * Appends the line, followed by a space, to the paragraph unless the line
     * is blank or looks like a numbered heading.
     *
     * @param line the current (trimmed) line
     * @param lastLine the previous line, or {@code null}; currently unused
     * @param nextLine the following line, or {@code null} if there is none
     * @param paragraph the paragraph being assembled
     */
    private static void addLineToParagraph(String line, String lastLine, String nextLine, StringBuilder paragraph){
        if (StringUtils.isBlank(line)) {
            return;
        }
        if (nextLine != null) {
            // the current line starts with a digit and ends with a letter
            if (Character.isDigit(line.charAt(0))
                    && Character.isAlphabetic(line.charAt(line.length()-1))
                    // and the next line is blank, or starts with a digit or an upper-case letter
                    && (StringUtils.isBlank(nextLine)
                        || Character.isDigit(nextLine.charAt(0))
                        || Character.isUpperCase(nextLine.charAt(0)))) {
                LOGGER.debug("忽略数字标题,不做分析:"+line);
                return;
            }
        }
        paragraph.append(line).append(" ");
    }

    /**
     * Heuristically tells whether the paragraph is program code or markup.
     *
     * @param paragraph the paragraph to test
     * @return {@code true} if the paragraph looks like Java code or an HTML/XML tag
     */
    public static boolean isProgramCode(String paragraph){
        // Java code
        return paragraph.startsWith("package")
                || paragraph.startsWith("import")
                || paragraph.startsWith("public")
                || paragraph.startsWith("private")
                || paragraph.startsWith("/**")
                || paragraph.contains(");")
                || paragraph.contains("}")
                || paragraph.contains("{")
                // HTML and XML tags
                || paragraph.startsWith("<");
    }

    /**
     * Validates the paragraph, splits it into sentences and adds them to {@code data}.
     *
     * @param paragraph the paragraph text; blank paragraphs are ignored
     * @param data receives the extracted sentences
     */
    private static void process(String paragraph, List<String> data){
        if (StringUtils.isNotBlank(paragraph)) {
            LOGGER.debug("段落:" + paragraph);
            // check that the paragraph is valid
            if (paragraphValid(paragraph)) {
                // split the paragraph into sentences
                List<String> sentences = segSentence(paragraph);
                if (!sentences.isEmpty()) {
                    data.addAll(sentences);
                }
            }
        }
    }

    /**
     * Checks whether the paragraph is usable: every character except the first
     * must lie in one of the accepted character ranges, and the paragraph must
     * not look like program code. The first offending character is recorded in
     * {@code CORRUPT_CHAR}.
     *
     * @param paragraph the paragraph to check
     * @return {@code true} if the paragraph should be analyzed
     */
    public static boolean paragraphValid(String paragraph){
        // detect text that has been corrupted;
        // the first character does not need to be checked
        char[] chars = paragraph.toCharArray();
        for (int i = 1; i < chars.length; i++) {
            char c = chars[i];
            /*
             * 8208 ‐ 8209 ‑ 8210 ‒ 8211 – 8212 — 8213 ― 8214 ‖ 8215 ‗
             * 8216 ‘ 8217 ’ 8218 ‚ 8219 ‛ 8220 “ 8221 ” 8222 „ 8223 ‟
             * 8224 † 8225 ‡ 8226 • 8227 ‣ 8228 ․ 8229 ‥ 8230 … 8231 ‧
             */
            if (c >= 8208 && c <= 8231) {
                continue;
            }
            /*
             * 32..126: the printable ASCII characters, from space (32) to ~ (126).
             */
            if (c >= 32 && c <= 126) {
                continue;
            }
            /*
             * 64256..64262: the Latin ligatures ff, fi, fl, ffi, ffl, ſt, st.
             */
            if (c >= 64256 && c <= 64262) {
                continue;
            }
            CORRUPT_CHAR.add(c);
            LOGGER.debug("忽略含有非法字符("+c+"="+(int)c+")的文本,字符下标:"+i+",不做分析:"+paragraph);
            return false;
        }
        if (isProgramCode(paragraph)) {
            LOGGER.debug("忽略程序代码,不做分析:"+paragraph);
            return false;
        }
        return true;
    }

    /**
     * Splits a paragraph into sentences and keeps those accepted by
     * {@link #processSentence(String)}.
     *
     * @param paragraph the paragraph text
     * @return the accepted sentences, possibly empty
     */
    private static List<String> segSentence(String paragraph){
        List<String> data = new ArrayList<>();
        // pre-process before splitting
        paragraph = prepareSeg(paragraph);
        if (StringUtils.isBlank(paragraph)) {
            return data;
        }
        // split into sentences at the separator characters
        for (String s : paragraph.split("[..。•]")) {
            if (StringUtils.isBlank(s)) {
                continue;
            }
            LOGGER.debug("处理句子:" + s);
            s = processSentence(s);
            if (s == null) {
                continue;
            }
            // append the full stop
            if (Character.isAlphabetic(s.charAt(s.length() - 1))) {
                s += ".";
            }
            // restore the dots protected by prepareSeg
            s = s.replace(DOT_PLACEHOLDER, ".");
            data.add(s);
            LOGGER.debug("得到句子:" + s);
            if (LOGGER.isDebugEnabled()) {
                int length = s.split("\\s+").length;
                // collect the distribution of sentence lengths
                SENTENCE_LENGTH_INFO.computeIfAbsent(length, k -> new AtomicInteger()).incrementAndGet();
            }
        }
        return data;
    }

    /**
     * Cleans a candidate sentence and decides whether to keep it.
     *
     * <p>Leading non-alphabetic characters are removed. The sentence is then
     * rejected if it: is blank; ends with a comma; contains no letter; does not
     * start with an upper-case letter; has a one-letter first word other than
     * "A" or "I"; has an all-upper-case first word; has fewer than
     * {@code SENTENCE_WORD_MIN_COUNT} words; ends with a numeric word; has too
     * many capitalized words; contains an over-long word; has too many
     * non-alphabetic words; has too many words missing from the dictionary; or
     * has unbalanced brackets or quotes.
     *
     * @param sentence the candidate sentence
     * @return the cleaned sentence, or {@code null} if it is rejected
     */
    public static String processSentence(String sentence){
        // ignore empty lines
        if (StringUtils.isBlank(sentence)) {
            LOGGER.debug("忽略没有内容的句子:" + sentence);
            return null;
        }
        sentence = sentence.trim();
        if (sentence.endsWith(",")) {
            LOGGER.debug("以逗号结尾,不做分析:"+sentence);
            return null;
        }
        // strip the non-alphabetic characters at the start
        int i = 0;
        for (char c : sentence.toCharArray()) {
            if (Character.isAlphabetic(c)) {
                break;
            }
            i++;
        }
        if (i >= sentence.length()) {
            LOGGER.debug("忽略没有字母的句子:" + sentence);
            return null;
        }
        if (i > 0) {
            sentence = sentence.substring(i);
        }
        if (StringUtils.isBlank(sentence)) {
            LOGGER.debug("忽略没有内容的句子:" + sentence);
            return null;
        }
        // ignore sentences whose first letter is not upper case
        if (!Character.isUpperCase(sentence.charAt(0))) {
            LOGGER.debug("忽略首字母非大写的句子:" + sentence);
            return null;
        }
        String[] words = sentence.split("\\s+");
        if (words[0].length() == 1 && !"A".equals(words[0]) && !"I".equals(words[0])) {
            LOGGER.debug("忽略第一个单词不合法的句子:" + sentence);
            return null;
        }
        if (words[0].length() > 1 && StringUtils.isAllUpperCase(words[0])) {
            LOGGER.debug("忽略首单词全大写的句子:" + sentence);
            return null;
        }
        // check the sentence length
        if (words.length < SENTENCE_WORD_MIN_COUNT) {
            LOGGER.debug("忽略长度小于" + SENTENCE_WORD_MIN_COUNT + "的句子:" + sentence);
            return null;
        }
        // check whether the last word is a number
        if (StringUtils.isNumeric(words[words.length-1])) {
            LOGGER.debug("忽略最后一个单词是数字" + words[words.length-1] + "的句子:" + sentence);
            return null;
        }
        // number of words starting with an upper-case letter
        int capWordCount = 0;
        // length of the longest word (URLs are not counted)
        int maxWordCharCount = 0;
        for (String word : words) {
            if (Character.isUpperCase(word.charAt(0))) {
                capWordCount++;
            }
            if (!word.contains("http://") && word.length() > maxWordCharCount) {
                maxWordCharCount = word.length();
            }
        }
        if (capWordCount > words.length*SENTENCE_CAP_WORD_MAX_RATE) {
            LOGGER.debug("忽略首字母大写单词数" + capWordCount + "多于" + words.length*SENTENCE_CAP_WORD_MAX_RATE + "的句子:" + sentence);
            return null;
        }
        if (maxWordCharCount > MAX_WORD_CHAR_COUNT) {
            LOGGER.debug("忽略有超长单词的句子,单词长度" + maxWordCharCount + "大于" + MAX_WORD_CHAR_COUNT + "的句子:" + sentence);
            return null;
        }
        // number of words that are not purely alphabetic once punctuation is removed
        int specialWordCount = 0;
        for (String word : words) {
            for (String c : PUNCTUATION) {
                word = word.replace(c, "");
            }
            if (StringUtils.isNotBlank(word) && !StringUtils.isAlpha(word)) {
                LOGGER.debug("特殊非字母单词:"+word);
                specialWordCount++;
            }
        }
        if (specialWordCount > Math.log(words.length)/2) {
            LOGGER.debug("总次数:"+words.length+",忽略非字母单词数" + specialWordCount + "多于" + Math.log(words.length)/2 + "的句子:" + sentence);
            return null;
        }
        // number of tokens that are not dictionary words
        int notWordCount = 0;
        Set<String> toCheck = TextAnalyzer.seg(sentence).stream().collect(Collectors.toSet());
        LOGGER.debug("需要检查单词个数:"+toCheck.size());
        for (String word : toCheck) {
            if (!DICTIONARY.contains(new Word(word.toLowerCase(), ""))) {
                LOGGER.debug("未知单词:"+word);
                notWordCount++;
            }
        }
        LOGGER.debug("未知的单词个数:"+notWordCount);
        if (notWordCount > toCheck.size()*0.4) {
            LOGGER.debug("待检查的单词在已有词典中不存在数" + notWordCount + "大于" + toCheck.size()*0.4 + "的句子:" + sentence);
            return null;
        }
        // check that [] () and quotes are balanced
        if (sentence.contains("[")
                || sentence.contains("]")
                || sentence.contains("(")
                || sentence.contains(")")
                || sentence.contains("“")
                || sentence.contains("”")
                || sentence.contains("\"")) {
            char[] chars = sentence.toCharArray();
            // opening and closing marks are counted together, not per kind
            int pre = 0;
            int suf = 0;
            int quotCount = 0;
            for (int j = 0; j < chars.length; j++) {
                char c = chars[j];
                switch (c) {
                    case '[':
                    case '(':
                    case '“':
                        LOGGER.debug("匹配:"+c+",下标:"+j);
                        pre++;
                        break;
                    case ']':
                    case ')':
                    case '”':
                        LOGGER.debug("匹配:"+c+",下标:"+j);
                        suf++;
                        break;
                    case '"':
                        LOGGER.debug("匹配:"+c+",下标:"+j);
                        quotCount++;
                        break;
                }
            }
            if (pre != suf) {
                LOGGER.debug("[]()配对检查失败,前向数:"+pre+",后向数:"+suf);
                return null;
            }
            if (quotCount%2 == 1) {
                LOGGER.debug("[]()配对检查失败,双引号数:"+quotCount);
                return null;
            }
        }
        return sentence;
    }

    /**
     * Pre-processes a paragraph before it is split into sentences.
     *
     * <p>".)" and "!)" are replaced with ". ". Then every "." that is neither
     * the last character of the (trimmed) paragraph nor followed by whitespace
     * is replaced with a placeholder, so that it is not treated as a sentence
     * separator by {@link #segSentence(String)}.
     *
     * @param paragraph the paragraph text
     * @return the pre-processed paragraph
     */
    private static String prepareSeg(String paragraph){
        paragraph = paragraph.replace(".)", ". ");
        paragraph = paragraph.replace("!)", ". ");
        if (paragraph.contains(".")) {
            paragraph = paragraph.trim();
            StringBuilder data = new StringBuilder();
            int index = 0;
            int last = 0;
            boolean replaced = false;
            while ((index = paragraph.indexOf(".", index)) > -1) {
                boolean followedByWhitespace = index+1 < paragraph.length()
                        && Character.isWhitespace(paragraph.charAt(index+1));
                boolean lastChar = index == paragraph.length()-1;
                if (followedByWhitespace || lastChar) {
                    // a real sentence terminator: keep the dot
                    data.append(paragraph.substring(last, index+1));
                } else {
                    // a dot inside a token: protect it with the placeholder
                    data.append(paragraph.substring(last, index)).append(DOT_PLACEHOLDER);
                    replaced = true;
                }
                index++;
                last = index;
            }
            if (last < paragraph.length()) {
                data.append(paragraph.substring(last, paragraph.length()));
            }
            paragraph = data.toString();
            if (replaced) {
                LOGGER.debug("将.替换之后:"+paragraph);
            }
        }
        return paragraph;
    }

    /**
     * Tells whether the file must be skipped; only PDF documents are processed.
     *
     * @param file the file to check
     * @return {@code true} if the file name starts with "." or the path does
     *         not end with ".pdf"
     */
    private static boolean invalid(Path file){
        if (file.toFile().getName().startsWith(".")) {
            return true;
        }
        String fileName = file.toFile().getAbsolutePath();
        if (!fileName.endsWith(".pdf")) {
            LOGGER.info("放弃处理非PDF文件:" + fileName);
            return true;
        }
        return false;
    }

    /**
     * Determines the file the extracted text is saved to, deletes a previously
     * generated file of that name and creates the missing parent directories.
     *
     * <p>The target is {@code src/main/resources/it} followed by the part of
     * the source path below its grandparent directory, with ".pdf" removed and
     * ".txt" appended.
     *
     * @param file the source PDF file
     * @return the target file name, or {@code null} if it could not be prepared
     */
    private static String prepareTarget(Path file){
        try {
            String fileName = file.toFile().getAbsolutePath();
            String targetName = "src/main/resources/it"
                    + fileName.replace(file.getParent().getParent().toFile().getAbsolutePath(), "").replace(".pdf", "")
                    + ".txt";
            Path target = Paths.get(targetName);
            // delete the previously generated file
            Files.deleteIfExists(target);
            // prepare the directory structure
            if (Files.notExists(target.getParent())) {
                Files.createDirectories(target.getParent());
            }
            return targetName;
        } catch (Exception e) {
            LOGGER.error("准备目标文件出错:" + file, e);
        }
        return null;
    }

    /** Clears the collected sentence-length statistics. */
    public static void resetSentenceWordLengthInfo(){
        SENTENCE_LENGTH_INFO.clear();
    }

    /**
     * Logs, at debug level, the sentence-length distribution and the
     * characters that caused paragraphs to be rejected.
     */
    public static void showSentenceWordLengthInfo(){
        LOGGER.debug("句子词长分布:");
        SENTENCE_LENGTH_INFO
                .keySet()
                .stream()
                .sorted()
                .forEach(k -> {
                    LOGGER.debug(k + " -> " + SENTENCE_LENGTH_INFO.get(k));
                });
        LOGGER.debug("未识别的字符:");
        CORRUPT_CHAR.stream().sorted().forEach(c -> LOGGER.debug((int)c+"="+c.toString()));
    }

    public static void main(String[] args) throws Exception{
        resetSentenceWordLengthInfo();
        // extract a single file
        //String file = "/Users/apple/百度云同步盘/【大数据】相关技术英文原版电子书/activemq/ActiveMQ in Action.pdf";
        //parseFile(file);
        // extract one sub-category
        //String path = "/Users/apple/百度云同步盘/【大数据】相关技术英文原版电子书/cassandra";
        // extract all categories
        //String path = "/Users/apple/百度云同步盘/【大数据】相关技术英文原版电子书";
        // extract a directory
        //parseDirectory(path);
        // it-software-domain.zip contains 249 e-books about software development in the IT domain;
        // most of them are about big data and search engines, the author's research area.
        // Download address of it-software-domain.zip: http://pan.baidu.com/s/1kT1NA3l
        parseZip("/Users/apple/百度云同步盘/【大数据】相关技术英文原版电子书/it-software-domain.zip");
        showSentenceWordLengthInfo();
    }
}