Sentiment Classification with GPT Based on MindSpore
%%capture captured_output
!pip uninstall mindspore -y
!pip install -i https://pypi.mirrors.ustc.edu.cn/simple mindspore==2.2.14
!pip install mindnlp
!pip install jieba
%env HF_ENDPOINT=https://hf-mirror.com
import os
import mindspore
from mindspore.dataset import text, GeneratorDataset, transforms
from mindspore import nn
from mindnlp.dataset import load_dataset
from mindnlp._legacy.engine import Trainer, Evaluator
from mindnlp._legacy.engine.callbacks import CheckpointCallback, BestModelCallback
from mindnlp._legacy.metrics import Accuracy
# Load the IMDB movie-review dataset and keep the train/test splits.
imdb_ds = load_dataset('imdb', split=['train', 'test'])
imdb_train = imdb_ds['train']
imdb_test = imdb_ds['test']
imdb_train.get_dataset_size()
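# (Optional sanity check, not in the original notebook.) Peek at one raw example;
# this assumes the loaded split exposes 'text' and 'label' columns, which is how
# mindnlp's IMDB loader usually names them.
sample = next(imdb_train.create_dict_iterator())
print(sample['label'], str(sample['text'])[:200])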
import numpy as np

def process_dataset(dataset, tokenizer, max_seq_len=512, batch_size=4, shuffle=False):
    # On Ascend the graph prefers static shapes, so pad every sample to max_seq_len;
    # on GPU/CPU we can pad dynamically per batch instead.
    is_ascend = mindspore.get_context('device_target') == 'Ascend'

    def tokenize(text):
        if is_ascend:
            tokenized = tokenizer(text, padding='max_length', truncation=True, max_length=max_seq_len)
        else:
            tokenized = tokenizer(text, truncation=True, max_length=max_seq_len)
        return tokenized['input_ids'], tokenized['attention_mask']

    if shuffle:
        dataset = dataset.shuffle(batch_size)

    # Map the raw review text to token ids / attention mask, and cast labels to int32.
    dataset = dataset.map(operations=[tokenize], input_columns="text",
                          output_columns=['input_ids', 'attention_mask'])
    dataset = dataset.map(operations=transforms.TypeCast(mindspore.int32),
                          input_columns="label", output_columns="labels")

    if is_ascend:
        dataset = dataset.batch(batch_size)
    else:
        # Pad each batch to its longest sequence, using the pad token for input_ids
        # and 0 for attention_mask.
        dataset = dataset.padded_batch(batch_size, pad_info={'input_ids': (None, tokenizer.pad_token_id),
                                                             'attention_mask': (None, 0)})
    return dataset
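# (Illustration only, not part of the original pipeline.) A minimal sketch of how
# padded_batch pads variable-length rows to the longest sequence in each batch; the
# GPU/CPU branch above relies on the same mechanism with pad_token_id as the fill value.
toy = GeneratorDataset([(np.array([1, 2, 3]),), (np.array([4, 5]),)], column_names=['ids'], shuffle=False)
toy = toy.padded_batch(2, pad_info={'ids': (None, 0)})
print(next(toy.create_tuple_iterator())[0])  # rows padded with 0 to equal length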
from mindnlp.transformers import GPTTokenizer
# Load the openai-gpt tokenizer and register pad/bos/eos tokens, which the original
# GPT vocabulary does not define.
gpt_tokenizer = GPTTokenizer.from_pretrained('openai-gpt')
special_tokens_dict = {
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "pad_token": "<pad>",
}
num_added_toks = gpt_tokenizer.add_special_tokens(special_tokens_dict)
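# (Optional check, not in the original.) The three special tokens should now map to
# new ids appended to the vocabulary; pad_token_id is reused later for padding and is
# also written into the model config.
print(num_added_toks, gpt_tokenizer.pad_token_id, gpt_tokenizer.bos_token_id, gpt_tokenizer.eos_token_id)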
# Hold out 30% of the training split for validation, then build the three pipelines.
imdb_train, imdb_val = imdb_train.split([0.7, 0.3])
dataset_train = process_dataset(imdb_train, gpt_tokenizer, shuffle=True)
dataset_val = process_dataset(imdb_val, gpt_tokenizer)
dataset_test = process_dataset(imdb_test, gpt_tokenizer)
next(dataset_train.create_tuple_iterator())
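# (Optional sanity check, not in the original.) Confirm the processed columns and
# decode one example back to text; assumes the HuggingFace-style decode() exposed by
# mindnlp's GPTTokenizer.
print(dataset_train.get_col_names())  # expected: ['input_ids', 'attention_mask', 'labels']
batch = next(dataset_train.create_dict_iterator())
print(batch['labels'][0],
      gpt_tokenizer.decode(batch['input_ids'][0].asnumpy().tolist(), skip_special_tokens=True)[:200])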
from mindnlp.transformers import GPTForSequenceClassification
from mindspore.experimental.optim import Adam

# Binary classification head on top of openai-gpt; tell the model which id is padding
# and grow the embedding table to cover the 3 special tokens added above.
model = GPTForSequenceClassification.from_pretrained('openai-gpt', num_labels=2)
model.config.pad_token_id = gpt_tokenizer.pad_token_id
model.resize_token_embeddings(model.config.vocab_size + 3)

optimizer = nn.Adam(model.trainable_params(), learning_rate=2e-5)
metric = Accuracy()
# Save a checkpoint every epoch (keeping at most 2) and track the best model by the
# evaluation metric; auto_load=True reloads the best weights after training.
ckpoint_cb = CheckpointCallback(save_path='checkpoint', ckpt_name='gpt_imdb_finetune', epochs=1, keep_checkpoint_max=2)
best_model_cb = BestModelCallback(save_path='checkpoint', ckpt_name='gpt_imdb_finetune_best', auto_load=True)
# Evaluate on the held-out validation split during training (the original cell passed
# dataset_train here, leaving dataset_val unused).
trainer = Trainer(network=model, train_dataset=dataset_train,
                  eval_dataset=dataset_val, metrics=metric,
                  epochs=1, optimizer=optimizer, callbacks=[ckpoint_cb, best_model_cb],
                  jit=False)
trainer.run(tgt_columns="labels")
evaluator = Evaluator(network=model, eval_dataset=dataset_test, metrics=metric)
evaluator.run(tgt_columns="labels")
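# (Optional, not part of the original notebook.) A minimal inference sketch on a custom
# review with the fine-tuned model. It assumes the usual *ForSequenceClassification
# convention that the first output (or .logits) holds the class logits, with
# label 1 = positive for IMDB.
def predict_sentiment(review):
    model.set_train(False)
    tokens = gpt_tokenizer(review, truncation=True, max_length=512)
    input_ids = mindspore.Tensor([tokens['input_ids']], mindspore.int32)
    attention_mask = mindspore.Tensor([tokens['attention_mask']], mindspore.int32)
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs[0] if isinstance(outputs, tuple) else outputs.logits
    return 'positive' if int(logits.argmax(-1).asnumpy()[0]) == 1 else 'negative'

print(predict_sentiment("This movie was surprisingly good, I enjoyed every minute."))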