Merge pull request #160 from Bone-Fish/dev
[FEATURE] Add Jiuzhang model
nnnyt authored Mar 19, 2024
2 parents 7abc7d1 + e05f640 commit ddc432a
Showing 13 changed files with 2,010 additions and 5 deletions.
3 changes: 2 additions & 1 deletion AUTHORS.md
@@ -25,4 +25,5 @@
[Heng Yu](https://github.com/GNEHUY)

[Tianyun Ji](https://github.com/KINGNEWBLUSH)
The starred contributors are the corresponding authors.

[Chaokun Wang](https://github.com/Bone-Fish)
2 changes: 1 addition & 1 deletion EduNLP/I2V/__init__.py
@@ -2,4 +2,4 @@
# 2021/8/1 @ tongshiwei

from .i2v import I2V, get_pretrained_i2v
from .i2v import D2V, W2V, Elmo, Bert, HfAuto, DisenQ, QuesNet
from .i2v import D2V, W2V, Elmo, Bert, HfAuto, DisenQ, QuesNet, Jiuzhang
76 changes: 73 additions & 3 deletions EduNLP/I2V/i2v.py
@@ -11,10 +11,11 @@
from longling import path_append
from EduData import get_data
from ..Tokenizer import Tokenizer, get_tokenizer
from EduNLP.Pretrain import ElmoTokenizer, BertTokenizer, HfAutoTokenizer, DisenQTokenizer, QuesNetTokenizer, Question
from EduNLP.Pretrain import ElmoTokenizer, BertTokenizer, HfAutoTokenizer
from EduNLP.Pretrain import DisenQTokenizer, QuesNetTokenizer, JiuzhangTokenizer
from EduNLP import logger

__all__ = ["I2V", "D2V", "W2V", "Elmo", "Bert", "HfAuto", "DisenQ", "QuesNet", "get_pretrained_i2v"]
__all__ = ["I2V", "D2V", "W2V", "Elmo", "Bert", "HfAuto", "DisenQ", "QuesNet", "get_pretrained_i2v", "Jiuzhang"]


class I2V(object):
@@ -69,6 +70,9 @@ def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None,
if tokenizer == 'bert':
self.tokenizer = BertTokenizer.from_pretrained(
**tokenizer_kwargs if tokenizer_kwargs is not None else {})
elif tokenizer == 'jiuzhang':
self.tokenizer = JiuzhangTokenizer.from_pretrained(
**tokenizer_kwargs if tokenizer_kwargs is not None else {})
elif tokenizer == 'hf_auto':
self.tokenizer = HfAutoTokenizer.from_pretrained(
**tokenizer_kwargs if tokenizer_kwargs is not None else {})
@@ -606,14 +610,80 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwarg
tokenizer_kwargs=tokenizer_kwargs)


class Jiuzhang(I2V):
"""
The model aims to transfer item and tokens to vector with Jiuzhang.
Bases
-------
I2V
Parameters
-----------
tokenizer: str
the tokenizer name
t2v: str
the name of token2vector model
args:
the parameters passed to t2v
tokenizer_kwargs: dict
the parameters passed to tokenizer
pretrained_t2v: bool
True: use pretrained t2v model
False: use your own t2v model
kwargs:
the parameters passed to t2v
Returns
-------
i2v model: Jiuzhang
"""

def infer_vector(self, items: Tuple[List[str], List[dict], str, dict],
*args, key=lambda x: x, return_tensors='pt', **kwargs) -> tuple:
"""
It is a function to switch item to vector. And before using the function, it is nesseary to load model.
Parameters
-----------
items : str or dict or list
the item of question, or question list
return_tensors: str
tensor type used in tokenizer
args:
the parameters passed to t2v
kwargs:
the parameters passed to t2v
Returns
--------
vector:list
"""
is_batch = isinstance(items, list)
items = items if is_batch else [items]
inputs = self.tokenize(items, key=key, return_tensors=return_tensors)
return self.t2v.infer_vector(inputs, *args, **kwargs), self.t2v.infer_tokens(inputs, *args, **kwargs)

@classmethod
def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwargs):
model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True)
for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
model_path = model_path.replace(i, "")
logger.info("model_path: %s" % model_path)
tokenizer_kwargs = {"tokenizer_config_dir": model_path}
return cls("jiuzhang", name, pretrained_t2v=True, model_dir=model_dir, device=device,
tokenizer_kwargs=tokenizer_kwargs)
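
For context (not part of this diff), a minimal usage sketch of the new Jiuzhang I2V class follows. The model name "jiuzhang_test" and the item text are placeholders, not identifiers confirmed by this PR:

from EduNLP.I2V import Jiuzhang

# Hypothetical pretrained-model name; substitute one registered in the EduNLP model hub.
i2v = Jiuzhang.from_pretrained("jiuzhang_test", model_dir="./models")

items = ["Solve the inequality $x^2 - 3x - 4 < 0$."]
# infer_vector returns an (item_vector, token_vector) tuple, per the method above.
item_vector, token_vector = i2v.infer_vector(items)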


MODEL_MAP = {
"w2v": W2V,
"d2v": D2V,
"bert": Bert,
"hf_auto": HfAuto,
"disenq": DisenQ,
"quesnet": QuesNet,
"elmo": Elmo
"elmo": Elmo,
"jiuzhang": Jiuzhang,
}
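
MODEL_MAP lets callers resolve an I2V implementation from its string name. A minimal dispatch sketch follows (an assumption about usage, not this repo's get_pretrained_i2v; load_i2v is a hypothetical helper):

def load_i2v(model_type: str, name: str, **kwargs):
    # Look up the I2V subclass, e.g. MODEL_MAP["jiuzhang"] -> Jiuzhang,
    # then delegate to its from_pretrained constructor.
    cls = MODEL_MAP[model_type]
    return cls.from_pretrained(name, **kwargs)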


1 change: 1 addition & 0 deletions EduNLP/ModelZoo/__init__.py
@@ -4,3 +4,4 @@
from .rnn import *
from .disenqnet import *
from .quesnet import *
from .jiuzhang import *
2 changes: 2 additions & 0 deletions EduNLP/ModelZoo/jiuzhang/__init__.py
@@ -0,0 +1,2 @@
from .jiuzhang import *
from .modeling import CPTModel as JiuzhangModel
167 changes: 167 additions & 0 deletions EduNLP/ModelZoo/jiuzhang/jiuzhang.py
@@ -0,0 +1,167 @@
import torch
from torch import nn
import json
import os
from ..base_model import BaseModel
from ..utils import PropertyPredictionOutput, KnowledgePredictionOutput
from transformers import PretrainedConfig
from typing import List
from ..rnn.harnn import HAM
from transformers import BartConfig as JiuzhangConfig
from .modeling import CPTModel as JiuzhangModel


__all__ = ["JiuzhangForPropertyPrediction", "JiuzhangForKnowledgePrediction"]


class JiuzhangForPropertyPrediction(BaseModel):
def __init__(self, pretrained_model_dir=None, head_dropout=0.5, init=True):
super(JiuzhangForPropertyPrediction, self).__init__()
jiuzhang_config = JiuzhangConfig.from_pretrained(pretrained_model_dir)
if init:
print(f'Load Jiuzhang from checkpoint: {pretrained_model_dir}')
self.jiuzhang = JiuzhangModel.from_pretrained(pretrained_model_dir, ignore_mismatched_sizes=True)
else:
print(f'Load Jiuzhang from config: {pretrained_model_dir}')
self.jiuzhang = JiuzhangModel(jiuzhang_config)
self.hidden_size = self.jiuzhang.config.hidden_size
self.head_dropout = head_dropout
self.dropout = nn.Dropout(head_dropout)
self.classifier = nn.Linear(self.hidden_size, 1)
self.sigmoid = nn.Sigmoid()
self.criterion = nn.MSELoss()

self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "jiuzhang_config"]}
self.config['architecture'] = 'JiuzhangForPropertyPrediction'
self.config = PretrainedConfig.from_dict(self.config)

def forward(self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
labels=None):
outputs = self.jiuzhang(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
# outputs = self.jiuzhang(input_ids=input_ids, attention_mask=attention_mask)
item_embeds = outputs.last_hidden_state[:, 0, :]
item_embeds = self.dropout(item_embeds)

logits = self.sigmoid(self.classifier(item_embeds)).squeeze(1)
loss = None
if labels is not None:
loss = self.criterion(logits, labels)
return PropertyPredictionOutput(
loss=loss,
logits=logits,
)

@classmethod
def from_config(cls, config_path, **kwargs):
config_path = os.path.join(os.path.dirname(config_path), 'model_config.json')
with open(config_path, "r", encoding="utf-8") as rf:
model_config = json.load(rf)
model_config['pretrained_model_dir'] = os.path.dirname(config_path)
model_config.update(kwargs)
return cls(
pretrained_model_dir=model_config['pretrained_model_dir'],
head_dropout=model_config.get("head_dropout", 0.5),
init=model_config.get('init', False)
)

def save_config(self, config_dir):
config_path = os.path.join(config_dir, "model_config.json")
with open(config_path, "w", encoding="utf-8") as wf:
json.dump(self.config.to_dict(), wf, ensure_ascii=False, indent=2)
self.jiuzhang.config.save_pretrained(config_dir)
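
A hedged smoke-test sketch for the property-prediction head (not part of the diff; "path/to/jiuzhang" is a placeholder checkpoint directory, and the shapes are illustrative):

import torch

model = JiuzhangForPropertyPrediction(pretrained_model_dir="path/to/jiuzhang")
input_ids = torch.randint(0, model.jiuzhang.config.vocab_size, (2, 16))
attention_mask = torch.ones_like(input_ids)
labels = torch.rand(2)  # difficulty-style targets in [0, 1] for the MSE loss
out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
print(out.logits.shape, out.loss)  # torch.Size([2]) and a scalar loss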


class JiuzhangForKnowledgePrediction(BaseModel):
def __init__(self,
pretrained_model_dir=None,
num_classes_list: List[int] = None,
num_total_classes: int = None,
head_dropout=0.5,
flat_cls_weight=0.5,
attention_unit_size=256,
fc_hidden_size=512,
beta=0.5,
init=True
):
super(JiuzhangForKnowledgePrediction, self).__init__()
jiuzhang_config = JiuzhangConfig.from_pretrained(pretrained_model_dir)
if init:
print(f'Load Jiuzhang from checkpoint: {pretrained_model_dir}')
self.jiuzhang = JiuzhangModel.from_pretrained(pretrained_model_dir, ignore_mismatched_sizes=True)
else:
print(f'Load Jiuzhang from config: {pretrained_model_dir}')
self.jiuzhang = JiuzhangModel(jiuzhang_config)
self.hidden_size = self.jiuzhang.config.hidden_size
self.head_dropout = head_dropout
self.dropout = nn.Dropout(head_dropout)
self.sigmoid = nn.Sigmoid()
self.criterion = nn.MSELoss()
self.flat_classifier = nn.Linear(self.hidden_size, num_total_classes)
self.ham_classifier = HAM(
num_classes_list=num_classes_list,
num_total_classes=num_total_classes,
sequence_model_hidden_size=self.jiuzhang.config.hidden_size,
attention_unit_size=attention_unit_size,
fc_hidden_size=fc_hidden_size,
beta=beta,
dropout_rate=head_dropout
)
self.flat_cls_weight = flat_cls_weight
self.num_classes_list = num_classes_list
self.num_total_classes = num_total_classes

self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "jiuzhang_config"]}
self.config['architecture'] = 'JiuzhangForKnowledgePrediction'
self.config = PretrainedConfig.from_dict(self.config)

def forward(self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
labels=None):
outputs = self.jiuzhang(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
item_embeds = outputs.last_hidden_state[:, 0, :]
item_embeds = self.dropout(item_embeds)
tokens_embeds = outputs.last_hidden_state
tokens_embeds = self.dropout(tokens_embeds)
flat_logits = self.sigmoid(self.flat_classifier(item_embeds))
ham_outputs = self.ham_classifier(tokens_embeds)
ham_logits = self.sigmoid(ham_outputs.scores)
logits = self.flat_cls_weight * flat_logits + (1 - self.flat_cls_weight) * ham_logits
loss = None
if labels is not None:
labels = torch.sum(torch.nn.functional.one_hot(labels, num_classes=self.num_total_classes), dim=1)
labels = labels.float()
loss = self.criterion(logits, labels)
return KnowledgePredictionOutput(
loss=loss,
logits=logits,
)

@classmethod
def from_config(cls, config_path, **kwargs):
config_path = os.path.join(os.path.dirname(config_path), 'model_config.json')
with open(config_path, "r", encoding="utf-8") as rf:
model_config = json.load(rf)
model_config['pretrained_model_dir'] = os.path.dirname(config_path)
model_config.update(kwargs)
return cls(
pretrained_model_dir=model_config['pretrained_model_dir'],
head_dropout=model_config.get("head_dropout", 0.5),
num_classes_list=model_config.get('num_classes_list'),
num_total_classes=model_config.get('num_total_classes'),
flat_cls_weight=model_config.get('flat_cls_weight', 0.5),
attention_unit_size=model_config.get('attention_unit_size', 256),
fc_hidden_size=model_config.get('fc_hidden_size', 512),
beta=model_config.get('beta', 0.5),
init=model_config.get('init', False)
)

def save_config(self, config_dir):
config_path = os.path.join(config_dir, "model_config.json")
with open(config_path, "w", encoding="utf-8") as wf:
json.dump(self.config.to_dict(), wf, ensure_ascii=False, indent=2)
self.jiuzhang.config.save_pretrained(config_dir)
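
Similarly, a hedged sketch for the knowledge-prediction head (assuming a two-level hierarchy of 10 + 30 knowledge points; all names and shapes are illustrative, not from the PR):

import torch

model = JiuzhangForKnowledgePrediction(
    pretrained_model_dir="path/to/jiuzhang",
    num_classes_list=[10, 30],
    num_total_classes=40,
)
input_ids = torch.randint(0, model.jiuzhang.config.vocab_size, (2, 16))
attention_mask = torch.ones_like(input_ids)
labels = torch.tensor([[1, 12], [3, 25]])  # knowledge-point ids per item; one-hot encoded and summed in forward
out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
print(out.logits.shape)  # torch.Size([2, 40]): flat and hierarchical logits blended by flat_cls_weight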