import os

import torch
from torch.optim import AdamW  # transformers.AdamW is deprecated in recent releases; torch's AdamW is the recommended optimizer
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
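

# Dataset wrapper: tokenizes each text on the fly and returns tensors ready for DataLoader batching.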
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            # squeeze(0) drops the batch dimension added by return_tensors="pt"
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }
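

# Standard fine-tuning loop: one forward/backward pass per batch, average loss reported per epoch.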
def train_model(model, train_loader, optimizer, device, num_epochs=3):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{num_epochs}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch + 1} Loss: {total_loss / len(train_loader)}")
def evaluate_model(model, val_loader, device):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions)
    print(f"Validation Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)
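

# Persist the fine-tuned model and tokenizer in Hugging Face format for later reuse.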
def save_model(model, tokenizer, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved to {output_dir}")
def load_model(output_dir, device):
    tokenizer = BertTokenizer.from_pretrained(output_dir)
    model = BertForSequenceClassification.from_pretrained(output_dir)
    model.to(device)
    print(f"Model loaded from {output_dir}")
    return model, tokenizer
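

# Batch inference helper: returns predicted class ids and softmax probabilities for a list of texts.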
def predict(texts, model, tokenizer, device, max_length=128):
    model.eval()
    encodings = tokenizer(
        texts,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1).cpu().numpy()
    predictions = torch.argmax(logits, dim=1).cpu().numpy()
    return predictions, probabilities
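

# End-to-end demo: fine-tune a local checkpoint on toy data, evaluate, save, reload on CPU, and predict.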
def main():
    config = {
        "train_batch_size": 16,
        "val_batch_size": 16,
        "learning_rate": 5e-5,
        "num_epochs": 5,
        "max_length": 128,
        "device_id": 7,
        "model_dir": "model",
        "local_model_path": "roberta_tiny_model",
        # Not used below; the model is loaded from local_model_path instead.
        "pretrained_model_name": "uer/chinese_roberta_L-12_H-128",
    }
    device = torch.device(f"cuda:{config['device_id']}" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    tokenizer = BertTokenizer.from_pretrained(config["local_model_path"])
    model = BertForSequenceClassification.from_pretrained(config["local_model_path"], num_labels=2)
    model.to(device)
    train_texts = ["This is a great product!", "I hate this service."]
    train_labels = [1, 0]
    val_texts = ["Awesome experience.", "Terrible product."]
    val_labels = [1, 0]
    train_dataset = CustomDataset(train_texts, train_labels, tokenizer, config["max_length"])
    val_dataset = CustomDataset(val_texts, val_labels, tokenizer, config["max_length"])
    train_loader = DataLoader(train_dataset, batch_size=config["train_batch_size"], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config["val_batch_size"])
    optimizer = AdamW(model.parameters(), lr=config["learning_rate"])
    train_model(model, train_loader, optimizer, device, num_epochs=config["num_epochs"])
    evaluate_model(model, val_loader, device)
    save_model(model, tokenizer, config["model_dir"])
    loaded_model, loaded_tokenizer = load_model(config["model_dir"], "cpu")
    new_texts = ["I love this!", "It's the worst."]
    predictions, probabilities = predict(new_texts, loaded_model, loaded_tokenizer, "cpu")
    for text, pred, prob in zip(new_texts, predictions, probabilities):
        print(f"Text: {text}")
        print(f"Predicted Label: {pred} (Probability: {prob})")


if __name__ == "__main__":
    main()