
Commit

bug fix and ready for release
WangHexie committed Apr 10, 2020
1 parent 306fb17 commit d09ae54
Showing 5 changed files with 50 additions and 18 deletions.
33 changes: 33 additions & 0 deletions README.md
@@ -0,0 +1,33 @@
# Sentiment Classification (情感分类)
> Text sentiment classification
## Requirements

```sh
pip install -r requirements.txt
git clone https://github.com/WangHexie/ktrain.git
cd ktrain && pip install .
```

## Usage example

```sh
python ./main.py --traning-dataset ./data --prediction-file "./data/test.csv" --test-dataset ./data/test --max_len 80 --model hfl/chinese-roberta-wwm-ext --batch_size 196
```

```sh
usage: sentiment classification [-h] [--traning-dataset TRANING_DATASET]
[--prediction-file PREDICTION_FILE]
[--test-dataset TEST_DATASET] [--model MODEL]
[--batch_size BATCH_SIZE] [--max_len MAX_LEN]
[--temp_path TEMP_PATH]
[--early_stopping EARLY_STOPPING]
[--reduce_on_plateau REDUCE_ON_PLATEAU]

```
7 changes: 7 additions & 0 deletions data/labels.csv
@@ -0,0 +1,7 @@
开心,兴奋,
,,
,,
开心,,
,,
焦虑,开心,
,,
7 changes: 7 additions & 0 deletions data/texts.csv
@@ -0,0 +1,7 @@
大法开机无广告
"想多了吧如若不强制 等于无效"
"说实话网格员 有点浪费了"
每个女孩节日快乐!!!
我就也点赞了一个
条件别满足了
高三初三可以考虑早一点开学
19 changes: 2 additions & 17 deletions main.py
@@ -2,7 +2,7 @@
 from dataclasses import asdict
 
 from src.config.configs import ClassifierParam
-from src.submit import prediction_all, classifier_ensemble
+from src.submit import prediction_all
 
 parser = argparse.ArgumentParser("sentiment classification", fromfile_prefix_chars='@')
 parser.add_argument('--traning-dataset', type=str, help='train_dataset input path')
@@ -19,26 +19,11 @@
 
 args = parser.parse_args()
 
-prediction_paths = ["./test1.csv", "./test2.csv"]
-
 prediction_all(test_dir=args.test_dataset,
                train_file_dir=args.traning_dataset,
-               prediction_file_path=prediction_paths[0],
+               prediction_file_path=args.prediction_file,
                config=asdict(
                    ClassifierParam(epochs=30, batch_size=args.batch_size, max_len=args.max_len, model_name=args.model,
                                    learning_rate=1e-5, early_stopping=args.early_stopping,
                                    reduce_on_plateau=args.reduce_on_plateau)),
                temp_path=args.temp_path)
-
-prediction_all(test_dir=args.test_dataset,
-               train_file_dir=args.traning_dataset,
-               prediction_file_path=prediction_paths[1],
-               config=asdict(
-                   ClassifierParam(epochs=30, batch_size=156, max_len=85, model_name=args.model,
-                                   learning_rate=1e-5, early_stopping=args.early_stopping,
-                                   reduce_on_plateau=args.reduce_on_plateau)),
-               temp_path=args.temp_path)
-
-
-classifier_ensemble(prediction_paths, output_path=args.prediction_file)
-
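The parser in `main.py` is built with `fromfile_prefix_chars='@'`, which lets a long option list live in a file that is passed on the command line as `@path`. A minimal standalone sketch of that argparse feature (the `--batch_size`/`--max_len` flags are taken from the diff; the temp file path is incidental):

```python
import argparse
import os
import tempfile

parser = argparse.ArgumentParser("sentiment classification", fromfile_prefix_chars='@')
parser.add_argument('--batch_size', type=int)
parser.add_argument('--max_len', type=int)

# Write one argument per line; argparse reads each line as a single token,
# so the "--flag=value" form is used.
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    f.write("--batch_size=196\n--max_len=80\n")
    path = f.name

args = parser.parse_args([f"@{path}"])
os.unlink(path)
print(args.batch_size, args.max_len)  # → 196 80
```

This is why the README's long invocation could equally be stored in a config file and run as `python ./main.py @args.txt`.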
2 changes: 1 addition & 1 deletion src/data/dataset.py
@@ -39,7 +39,7 @@ def reverse_transform_one_hot_to_label(df_prediction):
 
     @staticmethod
     def read_one_hot_label(label_path=FilePath.label_path, mode="train"):
-        return Dataset._transform_original_label_to_one_hot(Dataset.read_original_label(label_path, mode=mode).values)
+        return Dataset._transform_original_label_to_one_hot(Dataset.read_original_label(label_path, mode=mode).values).drop(columns=[''])
 
     @staticmethod
     def read_splitted_train_file(path):
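The `dataset.py` fix appends `.drop(columns=[''])` after the one-hot transform. A plausible mechanism, sketched under the assumption that rows in `labels.csv` such as `开心,兴奋,` split into entries that include an empty string, which then surfaces as a spurious `''` column (`to_one_hot` here is a hypothetical stand-in, not the project's `_transform_original_label_to_one_hot`):

```python
import pandas as pd

# Rows as they would come out of labels.csv: trailing commas
# produce empty-string entries alongside the real labels.
rows = [["开心", "兴奋", ""], ["", "", ""], ["焦虑", "开心", ""]]

def to_one_hot(rows):
    labels = sorted({label for row in rows for label in row})  # '' sneaks in
    return pd.DataFrame(
        [[int(label in row) for label in labels] for row in rows],
        columns=labels,
    )

one_hot = to_one_hot(rows)
assert '' in one_hot.columns          # spurious empty-label column
one_hot = one_hot.drop(columns=[''])  # the fix applied in this commit
print(list(one_hot.columns))          # → ['兴奋', '开心', '焦虑']
```

Without the drop, the classifier would be trained to predict an empty-string "emotion", which matches the commit message's "bug fix".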
