
Commit

Merge pull request #152 from bigdata-ustc/dev
[FEATURE] Upgrade version to 1.0.0
KenelmQLH committed Jan 13, 2024
2 parents ee29eb6 + 598d788 commit 95cd9fc
Showing 63 changed files with 5,282 additions and 491 deletions.
8 changes: 6 additions & 2 deletions .github/workflows/python-test.yml
@@ -6,10 +6,14 @@ on: [push, pull_request]
jobs:
build:

runs-on: ubuntu-latest
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: [3.6, 3.7, 3.8, 3.9]
include:
- os: "ubuntu-latest"
- os: "ubuntu-20.04"
python-version: "3.6"

steps:
- uses: actions/checkout@v2
@@ -24,4 +28,4 @@ jobs:
- name: Test with pytest
run: |
pytest
codecov
codecov
1 change: 1 addition & 0 deletions AUTHORS.md
@@ -20,5 +20,6 @@

[Jundong Wu](https://github.com/wintermelon008)

[Shangzi Xue](https://github.com/ShangziXue)

The starred contributors are the corresponding authors.
6 changes: 6 additions & 0 deletions CHANGE.txt
@@ -1,3 +1,9 @@
v1.0.0
1. Support CUDA for I2V and T2V.
2. Add demos for downstream tasks, including knowledge, difficulty and discrimination prediction, similarity prediction, and paper segmentation.
3. Refactor quesnet for pretraining and vectorization.
4. Update documentation for tutorials and the API.

v0.0.9
1. Refactor tokenizer Basic Tokenizer and Pretrained Tokenizer
2. Refactor model structures following huggingface styles for Elmo, BERT, DisenQNet and QuesNet
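Note: the CUDA support listed under v1.0.0 is exposed through the new device argument on the I2V entry points changed in the file below. A minimal sketch of the intended usage, assuming get_pretrained_i2v is re-exported from EduNLP.I2V as in earlier releases; the model name "elmo_test" and the download directory are illustrative placeholders, not guaranteed to exist:

from EduNLP.I2V import get_pretrained_i2v

# device is forwarded to the underlying T2V model; "cuda" should work the same
# way on a GPU machine after this change, with "cpu" remaining the default.
i2v = get_pretrained_i2v("elmo_test", model_dir="./downloaded_models", device="cpu")

items = [r"Solve the inequality $x^{2}-3x-4<0$"]
item_vectors, token_vectors = i2v(items)  # __call__ delegates to infer_vector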
60 changes: 44 additions & 16 deletions EduNLP/I2V/i2v.py
@@ -1,6 +1,7 @@
# coding: utf-8
# 2021/8/1 @ tongshiwei

import torch
import json
import os.path
from typing import List, Tuple
@@ -59,12 +60,12 @@ class I2V(object):
"""

def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None,
pretrained_t2v=False, model_dir=MODEL_DIR, **kwargs):
pretrained_t2v=False, model_dir=MODEL_DIR, device='cpu', **kwargs):
if pretrained_t2v:
logger.info("Use pretrained t2v model %s" % t2v)
self.t2v = get_t2v_pretrained_model(t2v, model_dir)
self.t2v = get_t2v_pretrained_model(t2v, model_dir, device)
else:
self.t2v = T2V(t2v, *args, **kwargs)
self.t2v = T2V(t2v, device=device, *args, **kwargs)
if tokenizer == 'bert':
self.tokenizer = BertTokenizer.from_pretrained(
**tokenizer_kwargs if tokenizer_kwargs is not None else {})
@@ -82,31 +83,53 @@ def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None,
**tokenizer_kwargs if tokenizer_kwargs is not None else {})
self.params = {
"tokenizer": tokenizer,
"tokenizer_kwargs": tokenizer_kwargs,
"t2v": t2v,
"args": args,
"tokenizer_kwargs": tokenizer_kwargs,
"pretrained_t2v": pretrained_t2v,
"model_dir": model_dir,
"kwargs": kwargs,
"pretrained_t2v": pretrained_t2v
}
self.device = torch.device(device)

def __call__(self, items, *args, **kwargs):
"""transfer item to vector"""
return self.infer_vector(items, *args, **kwargs)

def tokenize(self, items, *args, key=lambda x: x, **kwargs) -> list:
# """tokenize item"""
"""
tokenize item
Parameters
----------
items: a list of questions

Returns
----------
tokens: list
"""
return self.tokenizer(items, *args, key=key, **kwargs)

def infer_vector(self, items, key=lambda x: x, **kwargs) -> tuple:
"""
get question embedding
NotImplemented
"""
raise NotImplementedError

def infer_item_vector(self, tokens, *args, **kwargs) -> ...:
"""NotImplemented"""
return self.infer_vector(tokens, *args, **kwargs)[0]

def infer_token_vector(self, tokens, *args, **kwargs) -> ...:
"""NotImplemented"""
return self.infer_vector(tokens, *args, **kwargs)[1]

def save(self, config_path):
"""
save the i2v parameters (self.params) as a JSON config to config_path

Parameters
----------
config_path: str
"""
with open(config_path, "w", encoding="utf-8") as wf:
json.dump(self.params, wf, ensure_ascii=False, indent=2)

@@ -123,6 +146,7 @@ def load(cls, config_path, *args, **kwargs):

@classmethod
def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
"""NotImplemented"""
raise NotImplementedError

@property
@@ -327,13 +351,13 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict],
return self.t2v.infer_vector(inputs, *args, **kwargs), self.t2v.infer_tokens(inputs, *args, **kwargs)

@classmethod
def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwargs):
model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True)
for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
model_path = model_path.replace(i, "")
logger.info("model_path: %s" % model_path)
tokenizer_kwargs = {"tokenizer_config_dir": model_path}
return cls("elmo", name, pretrained_t2v=True, model_dir=model_dir,
return cls("elmo", name, pretrained_t2v=True, model_dir=model_dir, device=device,
tokenizer_kwargs=tokenizer_kwargs)


@@ -386,17 +410,19 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict],
--------
vector:list
"""
is_batch = isinstance(items, list)
items = items if is_batch else [items]
inputs = self.tokenize(items, key=key, return_tensors=return_tensors)
return self.t2v.infer_vector(inputs, *args, **kwargs), self.t2v.infer_tokens(inputs, *args, **kwargs)

@classmethod
def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwargs):
model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True)
for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
model_path = model_path.replace(i, "")
logger.info("model_path: %s" % model_path)
tokenizer_kwargs = {"tokenizer_config_dir": model_path}
return cls("bert", name, pretrained_t2v=True, model_dir=model_dir,
return cls("bert", name, pretrained_t2v=True, model_dir=model_dir, device=device,
tokenizer_kwargs=tokenizer_kwargs)


@@ -452,7 +478,7 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict],
return i_vec, t_vec

@classmethod
def from_pretrained(cls, name, model_dir=MODEL_DIR, **kwargs):
def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', **kwargs):
model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True)
for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
model_path = model_path.replace(i, "")
@@ -461,7 +487,7 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, **kwargs):
tokenizer_kwargs = {
"tokenizer_config_dir": model_path,
}
return cls("disenq", name, pretrained_t2v=True, model_dir=model_dir,
return cls("disenq", name, pretrained_t2v=True, model_dir=model_dir, device=device,
tokenizer_kwargs=tokenizer_kwargs, **kwargs)


@@ -495,18 +521,20 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict],
token embeddings
question embedding
"""
is_batch = isinstance(items, list)
items = items if is_batch else [items]
encodes = self.tokenize(items, key=key, meta=meta, *args, **kwargs)
return self.t2v.infer_vector(encodes), self.t2v.infer_tokens(encodes)

@classmethod
def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwargs):
model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True)
for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
model_path = model_path.replace(i, "")
logger.info("model_path: %s" % model_path)
tokenizer_kwargs = {
"tokenizer_config_dir": model_path}
return cls("quesnet", name, pretrained_t2v=True, model_dir=model_dir,
return cls("quesnet", name, pretrained_t2v=True, model_dir=model_dir, device=device,
tokenizer_kwargs=tokenizer_kwargs)


@@ -520,7 +548,7 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
}


def get_pretrained_i2v(name, model_dir=MODEL_DIR):
def get_pretrained_i2v(name, model_dir=MODEL_DIR, device='cpu'):
"""
It is a convenient entry point if you want to convert items to vectors easily.
@@ -560,4 +588,4 @@ def get_pretrained_i2v(name, model_dir=MODEL_DIR):
)
_, t2v = get_pretrained_model_info(name)
_class, *params = MODEL_MAP[t2v], name
return _class.from_pretrained(*params, model_dir=model_dir)
return _class.from_pretrained(*params, model_dir=model_dir, device=device)
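Note: two of the infer_vector implementations above add an is_batch check, so a single item no longer has to be wrapped in a list by the caller, and the new infer_item_vector / infer_token_vector helpers on the base class return just one half of the (item, token) tuple. A rough illustration of the call patterns, assuming i2v is an instance of one of the subclasses that performs the is_batch wrapping (e.g. a BERT-based model built like the sketch above; names are illustrative):

# i2v built as in the earlier sketch (the pretrained model name there is a placeholder)
single = r"Compute the derivative of $x^{3}$"
batch = [single, r"Solve $x^{2}-3x-4<0$"]

# Both calls go through the same tokenize -> T2V path; the single item is wrapped
# into a one-element batch by the is_batch check.
i_vec, t_vec = i2v.infer_vector(single)
i_vecs, t_vecs = i2v.infer_vector(batch)

# Convenience accessors added in this release return only one element of the tuple.
item_only = i2v.infer_item_vector(batch)
tokens_only = i2v.infer_token_vector(batch)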
2 changes: 2 additions & 0 deletions EduNLP/ModelZoo/__init__.py
@@ -1,3 +1,5 @@
from .utils import *
from .bert import *
from .rnn import *
from .disenqnet import *
from .quesnet import *
2 changes: 1 addition & 1 deletion EduNLP/ModelZoo/base_model.py
@@ -31,7 +31,7 @@ def from_pretrained(cls, pretrained_model_path, *args, **kwargs):
config_path = os.path.join(pretrained_model_path, "config.json")
model_path = os.path.join(pretrained_model_path, "pytorch_model.bin")
model = cls.from_config(config_path, *args, **kwargs)
loaded_state_dict = torch.load(model_path)
loaded_state_dict = torch.load(model_path, map_location=torch.device('cpu'))
loaded_keys = loaded_state_dict.keys()
expected_keys = model.state_dict().keys()

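Note: the map_location argument added above is what lets a checkpoint saved from a CUDA device be restored on a CPU-only machine; without it, torch.load tries to place tensors back on the device they were saved from. A self-contained PyTorch sketch of the same pattern (the filename matches the convention used by from_pretrained above; the model is a stand-in):

import torch
from torch import nn

model = nn.Linear(4, 2)  # stand-in for the model built by from_config
torch.save(model.state_dict(), "pytorch_model.bin")

# map_location forces every tensor in the checkpoint onto the CPU at load time,
# after which the model can be moved to whatever device is actually available.
state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu"))
model.load_state_dict(state_dict)
model.to("cuda" if torch.cuda.is_available() else "cpu")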
71 changes: 42 additions & 29 deletions EduNLP/ModelZoo/bert/bert.py
@@ -1,36 +1,35 @@
import torch
from torch import nn
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
from baize.torch import load_net
import torch.nn.functional as F
import json
import os
from ..base_model import BaseModel
from transformers.modeling_outputs import ModelOutput
from transformers import BertModel, PretrainedConfig
from typing import List, Optional
from ..utils import PropertyPredictionOutput, KnowledgePredictionOutput
from transformers import BertModel, PretrainedConfig, BertConfig
from typing import List
from ..rnn.harnn import HAM

__all__ = ["BertForPropertyPrediction", "BertForKnowledgePrediction"]


class BertForPPOutput(ModelOutput):
loss: torch.FloatTensor = None
logits: torch.FloatTensor = None
__all__ = ["BertForPropertyPrediction", "BertForKnowledgePrediction"]


class BertForPropertyPrediction(BaseModel):
def __init__(self, pretrained_model_dir=None, head_dropout=0.5):
def __init__(self, pretrained_model_dir=None, head_dropout=0.5, init=True):
super(BertForPropertyPrediction, self).__init__()
self.bert = BertModel.from_pretrained(pretrained_model_dir)
bert_config = BertConfig.from_pretrained(pretrained_model_dir)
if init:
print(f'Load BertModel from checkpoint: {pretrained_model_dir}')
self.bert = BertModel.from_pretrained(pretrained_model_dir)
else:
print(f'Load BertModel from config: {pretrained_model_dir}')
self.bert = BertModel(bert_config)
self.hidden_size = self.bert.config.hidden_size
self.head_dropout = head_dropout
self.dropout = nn.Dropout(head_dropout)
self.classifier = nn.Linear(self.hidden_size, 1)
self.sigmoid = nn.Sigmoid()
self.criterion = nn.MSELoss()

self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__"]}
self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "bert_config"]}
self.config['architecture'] = 'BertForPropertyPrediction'
self.config = PretrainedConfig.from_dict(self.config)

@@ -47,44 +46,54 @@ def forward(self,
loss = None
if labels is not None:
loss = self.criterion(logits, labels) if labels is not None else None
return BertForPPOutput(
return PropertyPredictionOutput(
loss=loss,
logits=logits,
)

@classmethod
def from_config(cls, config_path, **kwargs):
config_path = os.path.join(os.path.dirname(config_path), 'model_config.json')
with open(config_path, "r", encoding="utf-8") as rf:
model_config = json.load(rf)
model_config['pretrained_model_dir'] = os.path.dirname(config_path)
model_config.update(kwargs)
return cls(
pretrained_model_dir=model_config['pretrained_model_dir'],
head_dropout=model_config.get("head_dropout", 0.5)
head_dropout=model_config.get("head_dropout", 0.5),
init=model_config.get('init', False)
)

# @classmethod
# def from_pretrained(cls):
# NotImplementedError
# # need to verify compatibility with huggingface models
def save_config(self, config_dir):
config_path = os.path.join(config_dir, "model_config.json")
with open(config_path, "w", encoding="utf-8") as wf:
json.dump(self.config.to_dict(), wf, ensure_ascii=False, indent=2)
self.bert.config.save_pretrained(config_dir)


class BertForKnowledgePrediction(BaseModel):
def __init__(self,
pretrained_model_dir=None,
num_classes_list: List[int] = None,
num_total_classes: int = None,
pretrained_model_dir=None,
head_dropout=0.5,
flat_cls_weight=0.5,
attention_unit_size=256,
fc_hidden_size=512,
beta=0.5,
init=True
):
super(BertForKnowledgePrediction, self).__init__()
self.bert = BertModel.from_pretrained(pretrained_model_dir)
bert_config = BertConfig.from_pretrained(pretrained_model_dir)
if init:
print(f'Load BertModel from checkpoint: {pretrained_model_dir}')
self.bert = BertModel.from_pretrained(pretrained_model_dir)
else:
print(f'Load BertModel from config: {pretrained_model_dir}')
self.bert = BertModel(bert_config)
self.hidden_size = self.bert.config.hidden_size
self.head_dropout = head_dropout
self.dropout = nn.Dropout(head_dropout)
self.classifier = nn.Linear(self.hidden_size, 1)
self.sigmoid = nn.Sigmoid()
self.criterion = nn.MSELoss()
self.flat_classifier = nn.Linear(self.hidden_size, num_total_classes)
@@ -101,7 +110,7 @@ def __init__(self,
self.num_classes_list = num_classes_list
self.num_total_classes = num_total_classes

self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__"]}
self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "bert_config"]}
self.config['architecture'] = 'BertForKnowledgePrediction'
self.config = PretrainedConfig.from_dict(self.config)

@@ -124,15 +133,17 @@ def forward(self,
labels = torch.sum(torch.nn.functional.one_hot(labels, num_classes=self.num_total_classes), dim=1)
labels = labels.float()
loss = self.criterion(logits, labels) if labels is not None else None
return BertForPPOutput(
return KnowledgePredictionOutput(
loss=loss,
logits=logits,
)

@classmethod
def from_config(cls, config_path, **kwargs):
config_path = os.path.join(os.path.dirname(config_path), 'model_config.json')
with open(config_path, "r", encoding="utf-8") as rf:
model_config = json.load(rf)
model_config['pretrained_model_dir'] = os.path.dirname(config_path)
model_config.update(kwargs)
return cls(
pretrained_model_dir=model_config['pretrained_model_dir'],
@@ -143,9 +154,11 @@ def from_config(cls, config_path, **kwargs):
attention_unit_size=model_config.get('attention_unit_size', 256),
fc_hidden_size=model_config.get('fc_hidden_size', 512),
beta=model_config.get('beta', 0.5),
init=model_config.get('init', False)
)

# @classmethod
# def from_pretrained(cls):
# NotImplementedError
# # need to verify compatibility with huggingface models
def save_config(self, config_dir):
config_path = os.path.join(config_dir, "model_config.json")
with open(config_path, "w", encoding="utf-8") as wf:
json.dump(self.config.to_dict(), wf, ensure_ascii=False, indent=2)
self.bert.config.save_pretrained(config_dir)
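Note: the new init flag separates building the BERT backbone from loading its weights. init=True pulls the full checkpoint with BertModel.from_pretrained, while init=False only instantiates the architecture from the saved BertConfig; from_config now passes init=False by default because the fine-tuned weights are restored afterwards from pytorch_model.bin (on CPU, thanks to the map_location change above). A hedged sketch of both paths; "bert-base-chinese" is an illustrative backbone and the saved-model path is a placeholder:

from EduNLP.ModelZoo import BertForPropertyPrediction  # re-exported via the new __init__ imports

# Fresh training run: load the pretrained backbone weights.
model = BertForPropertyPrediction(pretrained_model_dir="bert-base-chinese", init=True)

# Restoring a fine-tuned model: from_pretrained rebuilds the architecture from
# model_config.json (init defaults to False there) and then loads pytorch_model.bin.
# model = BertForPropertyPrediction.from_pretrained("path/to/saved_model_dir")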