package test.load.pdftxt; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.LineNumberReader; import java.util.regex.Pattern; public class PDFFormatFileScanToFormat { public static void main(String[] args) { try { String fromFile = "d:\\L.txt"; new PDFFormatFileScanToFormat().deal(fromFile); } catch (Exception e) { e.printStackTrace(); } } static final int PAGE_HEADER = 10; static final int BEGIN_PAGE_HEADER = 202; static final int BEGINE_LINE = 201; static final int PAGE_LINE = 15; static final int EMPTY_LINE = 100; LineNumberReader in = null; FileWriter out = null; public void deal(String fromFile) throws Exception { String toFile = fromFile.replaceAll("[.]", "." + System.currentTimeMillis() + "."); FileReader fr = new FileReader(fromFile); InputCharStream in = new InputCharStream(fr); out = new FileWriter(toFile, false); int status = 0; int lastIndent = 0; int indent = 0; status = BEGINE_LINE; String line = ""; String lastLine = ""; int from = 0, to = 0; String indentSpace = ""; String leadNum = ""; // int cntLine = 0; boolean newLined = false; try { OutterWhile: while (true) { if (in.LA(1) > 0X8000) { in.consume(); continue; } indent = 0; line = ""; indentSpace = ""; leadNum = ""; switch (in.LA(1)) { case '-': indent = lastIndent; from = in.index(); line = toLineEnd(in); System.out.println("CLEAR |" + line); from = in.index(); toLineEnd(in); assert to - from < 1; System.out.println("CLEAR |"); status = PAGE_HEADER; line = lastLine; break; case '\r': in.consume(); case '\n': in.consume(); if (status == PAGE_LINE) { if (!newLined) this.newLine("case '\n' status == PAGE_LINE"); newLined = true; } status = EMPTY_LINE; line = ""; this.newLine("case '\n' status == EMPTY_LINE"); newLined = true; break; case -1: break OutterWhile; case ' ': from = in.index(); while ((c = in.LA(1)) == ' ') in.consume(); indent = in.index() - from; if (in.index() > from) { indentSpace = in.substring(from, in.index() - 1); } default: from = in.index(); consumeLeadingNumber(in); leadNum = ""; indent += in.index() - from; if (in.index() > from) { leadNum = in.substring(from, in.index() - 1); } // Empty Line from = in.index(); line = toLineEnd(in); if (line.length() == 0) { if (status == PAGE_LINE) { this.newLine("default: if (status == PAGE_LINE) {"); newLined = true; } status = EMPTY_LINE; line = ""; this.newLine("default: status = EMPTY_LINE;"); newLined = true; indentSpace = ""; leadNum = ""; break; } if (status == PAGE_HEADER && indent > 30 && pPageBegin.matcher(line).find()) { status = PAGE_HEADER; System.out.println("CLEAR |" + indentSpace + leadNum + line); line = lastLine; break; } if (!newLined) { if (Math.abs(lastIndent - indent) > 1) { this.newLine("Math.abs"); this.write(indentSpace); } else if (lastIndent > 0 && line.length() > 0) { // System.out.println("[[" + // pLineNum.matcher(lastLine).find() + "]] -- |" + // lastLine); if (pLineNum.matcher(lastLine).find()) { this.newLine("lastIndent > 0"); this.write(indentSpace); } else { this.write(" "); } } else { this.write(" "); } } else { this.write(indentSpace); } status = PAGE_LINE; if (line.length() > 20 && ((line.charAt(line.length() - 1) == '-' && Character.isLetter(line.charAt(line.length() - 2)) || (line.charAt(line.length() - 2) == '-' && Character .isLetter(line.charAt(line.length() - 2)))))) { System.out.println("****[[" + pLineEndHalf.matcher(line).hitEnd() + "]] -- |" + line); // if (pLineEndHalf.matcher(line).hitEnd()) { line = line.substring(0, line.length() - 2); // } } this.write(leadNum + line); newLined = false; indentSpace = ""; leadNum = ""; break; } lastIndent = indent; lastLine = line; // cntLine++; // if ((cntLine >> 8) << 8 == cntLine) { // System.out.println(String.valueOf(cntLine)); // } } } finally { out.close(); } } static final Pattern pLineEndHalf = Pattern.compile("[A-Za-z\\.\\,\\;\\'\\\" ]+[-] ?"); static final Pattern pPageBegin = Pattern.compile("[A-Z][A-Z][A-Z]*");// (\\([0-9a-kA-Z]*\\))?[A-Z]* // *[0-9]* // *"); static final Pattern pLineNum = Pattern.compile("(\\. )(\\. )+[\\. ]+[0-9]+ *"); int consumeLeadingSpace(InputCharStream in) { int from = in.index(); int to = in.index(); while ((c = in.LA(1)) == ' ') ; assert to > from; return from - to; } String consumeLeadingNumber(InputCharStream in) { int from = in.index(); int to = in.index(); Outter: while ((c = in.LA(1)) > 0) { switch (c) { case '0' - '9': break; case '•': break; case '.': break; default: break Outter; } in.consume(); } to = in.index(); if (to > from) { return in.substring(from, to - 1); } else { return ""; } } String toLineEnd(InputCharStream in) { int from = in.index(); int to = in.index(); Outter: while ((c = in.LA(1)) > 0) { switch (c) { case '\r': to = in.index() - 1; in.consume(); break; case '\n': if (to <= from) to = in.index() - 1; in.consume(); break Outter; default: in.consume(); break; } } assert to >= from; if (to > from) { return in.substring(from, to - 1); } else { return ""; } } int status = 0; int c = -1; void nextLine() { try { c = in.read(); } catch (IOException e) { throw new RuntimeException(); } } void write(String sline) { try { out.write(sline); } catch (IOException e) { throw new RuntimeException(); } } void newLine(String location) { try { out.write("\r\n");// + "[["+ location +"]]"); } catch (IOException e) { throw new RuntimeException(); } } }