diff --git a/pom.xml b/pom.xml
index a29c51e6e..de65e9474 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,7 +4,7 @@
com.hankcs
hanlp
- 1.2.4
+ 1.2.5
HanLP
http://www.hankcs.com/
@@ -79,6 +79,15 @@
+
+
+ org.apache.maven.plugins
+ maven-failsafe-plugin
+ 2.18.1
+
+ true
+
+
diff --git a/src/main/java/com/hankcs/hanlp/corpus/io/EasyReader.java b/src/main/java/com/hankcs/hanlp/corpus/io/EasyReader.java
new file mode 100644
index 000000000..52c29a99e
--- /dev/null
+++ b/src/main/java/com/hankcs/hanlp/corpus/io/EasyReader.java
@@ -0,0 +1,124 @@
+/*
+ *
+ * He Han
+ * me@hankcs.com
+ * 2015/7/29 16:35
+ *
+ *
+ * Copyright (c) 2008-2015, 码农场. All Right Reserved, http://www.hankcs.com/
+ * This source is subject to Hankcs. Please contact Hankcs to get more information.
+ *
+ */
+package com.hankcs.hanlp.corpus.io;
+
+import java.io.File;
+import java.io.FileFilter;
+
+/**
+ * Line-oriented text reader: feeds every non-empty line of a file, or of the files directly
+ * inside a directory, to a {@link LineHandler}.
+ *
+ * @author hankcs
+ */
+public class EasyReader
+{
+    /**
+     * Path of the file or directory to read
+     */
+    String root;
+    /**
+     * Whether progress is printed to standard output
+     */
+    boolean verbose = true;
+
+    /**
+     * Creates a reader that prints progress.
+     *
+     * @param root path of the file or directory to read
+     */
+    public EasyReader(String root)
+    {
+        this.root = root;
+    }
+
+    /**
+     * Creates a reader.
+     *
+     * @param root    path of the file or directory to read
+     * @param verbose whether to print progress to standard output
+     */
+    public EasyReader(String root, boolean verbose)
+    {
+        this.root = root;
+        this.verbose = verbose;
+    }
+
+    /**
+     * Reads at most {@code size} files and passes each non-empty line to {@code handler}.
+     * If the root is a directory, only its direct children that are regular files and whose
+     * names do not end with ".bin" are read; otherwise the root itself is read as a single file.
+     * {@link LineHandler#done()} is called once after the last file, unless the directory
+     * could not be listed, in which case the method returns without calling the handler.
+     *
+     * @param handler callback invoked for every non-empty line
+     * @param size    maximum number of files to read; a negative value means no limit
+     * @throws Exception if the handler throws
+     */
+    public void read(LineHandler handler, int size) throws Exception
+    {
+        File rootFile = new File(root);
+        File[] files;
+        if (rootFile.isDirectory())
+        {
+            files = rootFile.listFiles(new FileFilter()
+            {
+                @Override
+                public boolean accept(File pathname)
+                {
+                    return pathname.isFile() && !pathname.getName().endsWith(".bin");
+                }
+            });
+            // listFiles() returns null when the directory cannot be listed. The former
+            // isFile() fallback was unreachable here, because rootFile is known to be a directory.
+            if (files == null) return;
+        }
+        else
+        {
+            files = new File[]{rootFile};
+        }
+
+        // Number of files that will actually be visited, so the progress denominator is accurate.
+        int total = (size >= 0 && size < files.length) ? size : files.length;
+        int fileCount = 0;
+        int lineCount = 0;
+        long start = System.currentTimeMillis();
+        for (File file : files)
+        {
+            if (size-- == 0) break;
+            if (file.isDirectory()) continue;
+            if (verbose) System.out.printf("正在处理%s, %d / %d\n", file.getName(), ++fileCount, total);
+            IOUtil.LineIterator lineIterator = new IOUtil.LineIterator(file.getAbsolutePath());
+            while (lineIterator.hasNext())
+            {
+                ++lineCount; // empty lines are counted even though they are not handled
+                String line = lineIterator.next();
+                if (line.length() == 0) continue;
+                handler.handle(line);
+            }
+        }
+        handler.done();
+        if (verbose) System.out.printf("处理了 %.2f 万行,花费了 %.2f min\n", lineCount / 10000.0, (System.currentTimeMillis() - start) / 1000.0 / 60.0);
+    }
+
+    /**
+     * Reads the files under the root without a practical limit on their number
+     * (delegates to {@link #read(LineHandler, int)} with {@code Integer.MAX_VALUE}).
+     *
+     * @param handler callback invoked for every non-empty line
+     * @throws Exception if the handler throws
+     */
+    public void read(LineHandler handler) throws Exception
+    {
+        read(handler, Integer.MAX_VALUE);
+    }
+}
diff --git a/src/main/java/com/hankcs/hanlp/corpus/io/IOUtil.java b/src/main/java/com/hankcs/hanlp/corpus/io/IOUtil.java
index 0486eb9ed..b235966ac 100644
--- a/src/main/java/com/hankcs/hanlp/corpus/io/IOUtil.java
+++ b/src/main/java/com/hankcs/hanlp/corpus/io/IOUtil.java
@@ -361,4 +361,78 @@ public void remove()
throw new UnsupportedOperationException("只读,不可写!");
}
}
+
+    /**
+     * Opens a file for writing as UTF-8 text, replacing any existing content.
+     *
+     * @param path path of the file to write
+     * @return a buffered writer that encodes its output as UTF-8
+     * @throws FileNotFoundException        if the file cannot be opened for writing
+     * @throws UnsupportedEncodingException if the UTF-8 charset is not supported
+     */
+    public static BufferedWriter newBufferedWriter(String path) throws FileNotFoundException, UnsupportedEncodingException
+    {
+        return newBufferedWriter(path, false);
+    }
+
+    /**
+     * Opens a file for reading as UTF-8 text.
+     *
+     * @param path path of the file to read
+     * @return a buffered reader that decodes the file as UTF-8
+     * @throws FileNotFoundException        if the file cannot be opened for reading
+     * @throws UnsupportedEncodingException if the UTF-8 charset is not supported
+     */
+    public static BufferedReader newBufferedReader(String path) throws FileNotFoundException, UnsupportedEncodingException
+    {
+        return new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
+    }
+
+    /**
+     * Opens a file for writing as UTF-8 text.
+     *
+     * @param path   path of the file to write
+     * @param append {@code true} to append to the end of the file, {@code false} to replace its content
+     * @return a buffered writer that encodes its output as UTF-8
+     * @throws FileNotFoundException        if the file cannot be opened for writing
+     * @throws UnsupportedEncodingException if the UTF-8 charset is not supported
+     */
+    public static BufferedWriter newBufferedWriter(String path, boolean append) throws FileNotFoundException, UnsupportedEncodingException
+    {
+        return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path, append), "UTF-8"));
+    }
+
+    /**
+     * Returns the part of {@code name} after the last occurrence of {@code delimiter}.
+     * The whole delimiter is skipped, so multi-character delimiters work; if the delimiter does
+     * not occur, {@code name} is returned unchanged.
+     *
+     * @param name      text to take the suffix from
+     * @param delimiter separator to search for
+     * @return the text following the last delimiter, or {@code name} if there is none
+     */
+    public static String getSuffix(String name, String delimiter)
+    {
+        int index = name.lastIndexOf(delimiter);
+        if (index == -1) return name;
+        return name.substring(index + delimiter.length());
+    }
+
+    /**
+     * Writes the given values separated by tabs. No line terminator is written, and nothing is
+     * written when {@code params} is empty.
+     *
+     * @param bw     writer to write to
+     * @param params values to write
+     * @throws IOException if writing fails
+     */
+    public static void writeLine(BufferedWriter bw, String... params) throws IOException
+    {
+        for (int i = 0; i < params.length; i++)
+        {
+            // separator goes between values only, so an empty array is handled without indexing
+            if (i > 0) bw.write('\t');
+            bw.write(params[i]);
+        }
+    }
}
diff --git a/src/main/java/com/hankcs/hanlp/corpus/io/LineHandler.java b/src/main/java/com/hankcs/hanlp/corpus/io/LineHandler.java
new file mode 100644
index 000000000..7ae7f57f3
--- /dev/null
+++ b/src/main/java/com/hankcs/hanlp/corpus/io/LineHandler.java
@@ -0,0 +1,89 @@
+/*
+ *
+ * He Han
+ * me@hankcs.com
+ * 2015/7/29 16:37
+ *
+ *
+ * Copyright (c) 2008-2015, 码农场. All Right Reserved, http://www.hankcs.com/
+ * This source is subject to Hankcs. Please contact Hankcs to get more information.
+ *
+ */
+package com.hankcs.hanlp.corpus.io;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Splits each input line on a delimiter and hands the resulting fields to
+ * {@link #handle(String[])}, which subclasses implement.
+ *
+ * @author hankcs
+ */
+public abstract class LineHandler
+{
+    /**
+     * Separator between the fields of a line; a tab unless the constructor is given another one
+     */
+    String delimiter = "\t";
+
+    /**
+     * Creates a handler that splits lines on {@code delimiter}.
+     *
+     * @param delimiter separator between the fields of a line
+     */
+    public LineHandler(String delimiter)
+    {
+        this.delimiter = delimiter;
+    }
+
+    /**
+     * Creates a handler that splits lines on the default tab delimiter.
+     */
+    public LineHandler()
+    {
+    }
+
+    /**
+     * Splits {@code line} on every occurrence of the delimiter, keeping empty fields, and passes
+     * the fields to {@link #handle(String[])}. An empty delimiter cannot split anything, so the
+     * whole line is then passed as a single field.
+     *
+     * @param line line to split
+     * @throws Exception if {@link #handle(String[])} throws
+     */
+    public void handle(String line) throws Exception
+    {
+        List<String> tokenList = new LinkedList<String>();
+        // Advance by the full delimiter length so multi-character delimiters are consumed whole.
+        int step = delimiter.length();
+        int start = 0;
+        int end;
+        while (step > 0 && (end = line.indexOf(delimiter, start)) != -1)
+        {
+            tokenList.add(line.substring(start, end));
+            start = end + step;
+        }
+        tokenList.add(line.substring(start));
+        handle(tokenList.toArray(new String[0]));
+    }
+
+    /**
+     * Hook for work that must happen after all lines have been handled; does nothing by default.
+     *
+     * @throws IOException not thrown by this default implementation; declared for overriding subclasses
+     */
+    public void done() throws IOException
+    {
+        // nothing to do by default
+    }
+
+    /**
+     * Processes the fields of one line.
+     *
+     * @param params fields of the line, in order
+     * @throws IOException if processing fails
+     */
+    public abstract void handle(String[] params) throws IOException;
+}
diff --git a/src/test/java/com/hankcs/test/model/TestCRF.java b/src/test/java/com/hankcs/test/model/TestCRF.java
index 96be8e2e9..7106efe74 100644
--- a/src/test/java/com/hankcs/test/model/TestCRF.java
+++ b/src/test/java/com/hankcs/test/model/TestCRF.java
@@ -13,11 +13,15 @@
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
+import com.hankcs.hanlp.corpus.dictionary.EasyDictionary;
import com.hankcs.hanlp.corpus.document.CorpusLoader;
import com.hankcs.hanlp.corpus.document.Document;
import com.hankcs.hanlp.corpus.document.sentence.word.IWord;
import com.hankcs.hanlp.corpus.document.sentence.word.Word;
import com.hankcs.hanlp.corpus.io.ByteArray;
+import com.hankcs.hanlp.corpus.io.EasyReader;
+import com.hankcs.hanlp.corpus.io.IOUtil;
+import com.hankcs.hanlp.corpus.io.LineHandler;
import com.hankcs.hanlp.corpus.util.Precompiler;
import com.hankcs.hanlp.model.crf.FeatureFunction;
import com.hankcs.hanlp.model.crf.FeatureTemplate;
@@ -93,8 +97,8 @@ public void testSegment() throws Exception
*/
public void testPrepareCRFTrainingCorpus() throws Exception
{
- final BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("D:\\Tools\\CRF++-0.58\\example\\seg_cn\\2014人民日报语料BMES切分.txt"), "UTF-8"));
- CorpusLoader.walk("H:\\seg_corpus", new CorpusLoader.Handler()
+ final BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("e:\\2014.txt"), "UTF-8"));
+ CorpusLoader.walk("D:\\Doc\\语料库\\2014_hankcs", new CorpusLoader.Handler()
{
@Override
public void handle(Document document)
@@ -102,8 +106,10 @@ public void handle(Document document)
try
{
List> sentenceList = document.getSimpleSentenceList();
+ if (sentenceList.size() == 0) return;
for (List sentence : sentenceList)
{
+ if (sentence.size() == 0) continue;
for (IWord iWord : sentence)
{
String word = iWord.getValue();
@@ -118,28 +124,28 @@ public void handle(Document document)
bw.write(word);
bw.write('\t');
bw.write('S');
- bw.newLine();
+ bw.write('\n');
}
else
{
bw.write(word.charAt(0));
bw.write('\t');
bw.write('B');
- bw.newLine();
+ bw.write('\n');
for (int i = 1; i < word.length() - 1; ++i)
{
bw.write(word.charAt(i));
bw.write('\t');
bw.write('M');
- bw.newLine();
+ bw.write('\n');
}
bw.write(word.charAt(word.length() - 1));
bw.write('\t');
bw.write('E');
- bw.newLine();
+ bw.write('\n');
}
}
- bw.newLine();
+ bw.write('\n');
}
}
catch (IOException e)
@@ -187,4 +193,42 @@ public void testLoadModelWithBiGramFeature() throws Exception
model.tag(table);
System.out.println(table);
}
+
+    /**
+     * Copies the hard-coded input file to the hard-coded output file, dropping leading blank
+     * lines and collapsing every run of consecutive blank lines into a single one.
+     *
+     * @throws Exception if reading or writing fails
+     */
+    public void testRemoveSpace() throws Exception
+    {
+        String inputPath = "E:\\2014.txt";
+        String outputPath = "E:\\2014f.txt";
+        BufferedReader br = IOUtil.newBufferedReader(inputPath);
+        try
+        {
+            BufferedWriter bw = IOUtil.newBufferedWriter(outputPath);
+            try
+            {
+                String line;
+                // Length of the last line written; 0 initially so leading blank lines are skipped.
+                int preLength = 0;
+                while ((line = br.readLine()) != null)
+                {
+                    if (preLength == 0 && line.length() == 0) continue;
+                    bw.write(line);
+                    bw.newLine();
+                    preLength = line.length();
+                }
+            }
+            finally
+            {
+                bw.close();
+            }
+        }
+        finally
+        {
+            br.close();
+        }
+    }
}